########################################################
## PROGRAM NAME: 002_process_00t20.R                  ##
## AUTHOR: MATT MLECZKO                               ##
## DATE CREATED: 04/26/2021                           ##
## INPUTS:                                            ##
##    001_2020_Census_del.Rda                         ##
##    001_2000_trts_cb.Rda                            ##       
##    001_ltdb_00t10.Rda                              ##
##    001_2010_trts_cb.Rda                            ##
##    001_2020_trts_cb.Rda                            ## 
##    001_ltdb_20t10.Rda                              ##
##    001_2000_counties_cb.Rda                        ##
##    001_2010_counties_cb.Rda                        ##
##    001_2020_counties_cb.Rda                        ##
##                                                    ##
##                                                    ##
## OUTPUTS:                                           ##
##    002_trts_fsc.Rda                                ##
##    002_af_wide_final.Rda                           ##
##    002_af_long_final.Rda                           ##
##    002_agg_msa_2020.Rda                            ##
##                                                    ## 
## PURPOSE: Create main analytic file                 ##
##          (and appendix figure a1)                  ##
##                                                    ##
## LIST OF UPDATES:                                   ##
########################################################

log <- file("002_process_00t20.txt") 
sink(log, append=TRUE)
sink(log, append=TRUE, type="message")

## load libraries ##

library(tidycensus)
library(tigris)
library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(readxl)
library(sf)
library(haven)
library(data.table)
library(ggridges)

## define paths ##
data_path <- "PATH TO DATA HERE"
progs <- "PATH TO PROGRAMS HERE"

## set working directory ##
setwd(data_path)

##
## load necessary functions ## 
##

`%notin%` <- Negate(`%in%`)
source(paste0(progs,"stata_merge.R"))
source(paste0(progs,"hud_group.R"))
source(paste0(progs,"rd_data.R"))


## states to loop through ## 
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE",
            "DC","FL","GA","HI","ID","IL","IN","IA","KS",
            "KY","LA","ME","MD","MA","MI","MN","MS","MO",
            "MT","NE","NV","NH","NJ","NM","NY","NC","ND",
            "OH","OK","OR","PA","RI","SC","SD","TN","TX",
            "UT","VT","VA","WA","WV","WI","WY")

## 
## create MSA-FIPS crosswalk ##
## 

load("001_2020_Census_del.Rda")

fips.to.msas <- census.del.2020.final %>%
  filter(`Metropolitan/Micropolitan Statistical Area` == "Metropolitan Statistical Area") %>%
  select(FIPS,
         `CBSA Code`) %>%
  rename(CBSA = `CBSA Code`)

paste("ARE THERE 392 MSAs IN THE CROSSWALK FILE?")
length(unique(fips.to.msas$CBSA)) == 392

keep.msas <- fips.to.msas %>%
  group_by(CBSA) %>%
  summarize(n=n()) %>%
  select(CBSA)

cbsa.names <- census.del.2020.final %>%
  rename(CBSA = `CBSA Code`,
         CBSA_name = `CBSA Title`) %>%
  select(CBSA,
         CBSA_name) %>%
  distinct()

######################
## 2000 Census data ## 
######################

load("001_2000_trts_cb.Rda")

## add a denominator for calculating income groups for later on ##
all.tracts.2000.temp <- all.tracts.2000.cb %>%
  mutate(hhs_inc = rowSums(cbind(aian_hhld_inc,
                                 as_hhld_inc,
                                 bl_hhld_inc,
                                 nhpi_hhld_inc,
                                 hl_hhld_inc,
                                 oth_hhld_inc,
                                 tw_hhld_inc,
                                 nhwh_hhld_inc),na.rm=T))

##
## update GEOIDs to reflect 2010 tract boundaries ##
##

load("001_ltdb_00t10.Rda")

ltdb.00t10.fm <- ltdb.00t10 %>%
  select(trtid00,
         trtid10,
         weight,
         GEOID)

## pre-merge checks ##
paste("CLASS OF GEOID IN 2000 TRACT FILE")
class(all.tracts.2000.temp$GEOID)
paste("GEOID CHECK IN 2000 TRACT FILE")
range(nchar(trim(all.tracts.2000.temp$GEOID)))
paste("IS THE 2000 TRACT FILE UNIQUE BY GEOID?")
nrow(all.tracts.2000.temp) == length(unique(all.tracts.2000.temp$GEOID))

paste("CLASS OF GEOID IN LTDB 00T10")
class(ltdb.00t10.fm$GEOID)
paste("GEOID CHECK IN LTDB 00T10")
range(nchar(trim(ltdb.00t10.fm$GEOID)))

## merge the files ## 
all.tracts.2000.match.m <- stata.merge(all.tracts.2000.temp,
                                       ltdb.00t10.fm,
                                       "GEOID")

## diagnose merge ## 
paste("MERGE OF 2000 TRACT FILE AND LTDB 00T10")
table(all.tracts.2000.match.m$merge.variable)

## check: all the non-matching obs have 0 population ##
nm1.2000 <- all.tracts.2000.match.m %>%
  filter(merge.variable == 1) 

paste("DO ALL NON-MATCHING OBSERVATIONS HAVE 0 POPULATION?")
sum(nm1.2000$pop_total == 0) == nrow(nm1.2000)

## keep only the matches and obs with non-zero weights ##
all.tracts.2000.match <- all.tracts.2000.match.m %>%
  filter(merge.variable == 3 &
         weight != 0) %>%
  select(-merge.variable)

all.tracts.2000.nowt <- all.tracts.2000.match.m %>%
  filter(merge.variable == 3 & weight == 0)

## change the GEOID ##
## some 2010 tract IDs are duplicated because of  ##
## tracts that changed from 2000 to 2010; these   ##
## tracts will be condensed into one observation  ##
## via a weighted average, with weights from LTDB ##

all.tracts.2000.pr1a <- all.tracts.2000.match %>%
  select(GEOID,
         trtid00,
         trtid10, 
         everything()) %>%
  select(-median_hhld_inc) %>%
  relocate(r0, .after = last_col()) %>%
  mutate_at(vars(pop_total:hhs_inc),
            .funs = funs(. * weight))
  
all.tracts.2000.pr2a <- all.tracts.2000.pr1a %>% 
  group_by(trtid10) %>%
  summarize_at(vars(pop_total:hhs_inc),
               funs(sum(., na.rm=T))) %>%
  mutate(gq_per = gq_tot/pop_total,
         gq_per_alt = gq_num/gq_den) %>%
  rename(GEOID = trtid10)

## verify that both group quarter definitions are the same ##
gq.comp.2000 <- all.tracts.2000.pr2a %>%
  select(GEOID,
         gq_per,
         gq_per_alt) %>%
  mutate(comp = gq_per - gq_per_alt)

paste("COMPARISON OF GQ DEFINITIONS IN 2000")
summary(gq.comp.2000$comp)

##
## separate process for median hhld inc ## 
##

all.tracts.2000.pr2b <- all.tracts.2000.match %>%
  select(GEOID,
         trtid00,
         trtid10, 
         median_hhld_inc,
         weight) %>%
  group_by(trtid10) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., weight, na.rm=T))) %>%
  rename(GEOID = trtid10)

## check obs ## 
paste("OBS IN 2000 TRACT FILE A")
nrow(all.tracts.2000.pr2a)
paste("OBS IN 2000 TRACT FILE B")
nrow(all.tracts.2000.pr2b)

## merge the files ##
all.tracts.2000.rf.m <-  stata.merge(all.tracts.2000.pr2a,
                                     all.tracts.2000.pr2b, 
                                     "GEOID")

## check merge ##
paste("MERGE 2000 TRACT FILES A AND B")
table(all.tracts.2000.rf.m$merge.variable, useNA = "ifany")

##
## keep matches and make sample restrictions ##
##

all.tracts.2000.rf <- all.tracts.2000.rf.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

all.tracts.2000.rd <- all.tracts.2000.rf %>%
  filter(pop_total >= 500 & 
         hhs > 10 & 
         hhs_inc > 0 &
         gq_per <= 0.5)

paste("OBS IN 2000 TRACT FILE AFTER SAMPLE RESTRICTIONS")
nrow(all.tracts.2000.rd)

##
## condense r0 separately ##
##

all.tracts.2000.r0 <- all.tracts.2000.pr1a %>%
  filter(trtid10 %in% all.tracts.2000.pr2a$GEOID) %>%
  select(GEOID,
         trtid10,
         pop_total,
         r0) %>%
  mutate(r0_new = pop_total*r0)

## checks ## 
paste("OBS IN 2000 r0 FILE")
nrow(all.tracts.2000.r0)
paste("UNIQUE GEOIDS IN 2000 r0 FILE")
length(unique(all.tracts.2000.r0$trtid10))

all.tracts.2000.r0.final <- all.tracts.2000.r0 %>%
  group_by(trtid10) %>%
  summarize(num = sum(r0_new, na.rm=T),
            den = sum(pop_total, na.rm=T),
            r0 = num/den) %>%
  rename(GEOID = trtid10) %>% 
  select(GEOID,
         r0)

paste("FINAL OBS IN 2000 r0 FILE")
nrow(all.tracts.2000.r0.final)

##
## merge on correct r0 (for calculating rank H) ##
##

all.tracts.2000.pr3.m <- stata.merge(all.tracts.2000.rd,
                                     all.tracts.2000.r0.final,
                                     "GEOID")

## check merge ## 
paste("MERGE WITH 2000 TRACT FILE AND 2000 r0 FILE")
table(all.tracts.2000.pr3.m$merge.variable, useNA = "ifany")

##
## now, limit census tracts to metro areas ## 
##

## prepare 2000 Census tracts for merge ## 
all.tracts.2000.pr3 <- all.tracts.2000.pr3.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(FIPS = substr(GEOID,1,5))

## pre-merge checks ##
paste("CLASS of FIPS in 2000 TRACT FILE")
class(all.tracts.2000.pr3$FIPS)
paste("FIPS CHECK IN 2000 TRACT FILE")
range(nchar(trim(all.tracts.2000.pr3$FIPS)))
paste("IS 2000 TRACT FILE UNIQUE BY FIPS?")
nrow(all.tracts.2000.pr3) == length(unique(all.tracts.2000.pr3$FIPS))

paste("CLASS OF FIPS IN FIPS TO MSA XWALK")
class(fips.to.msas$FIPS)
paste("FIPS CHECK IN FIPS TO MSA XWALK")
range(nchar(trim(fips.to.msas$FIPS)))
paste("IS FIPS TO MSA XWALK UNIQUE BY FIPS?")
nrow(fips.to.msas) == length(unique(fips.to.msas$FIPS))

##
## merge the files ## 
##

metro.tracts.2000.m <- stata.merge(all.tracts.2000.pr3,
                                   fips.to.msas,
                                   "FIPS")

## diagnose merge ##
## check: the non-matching obs from fips.to.msas (2) are all in Puerto Rico ##
paste("MERGE BETWEEN 2000 TRACT FILE AND FIPS TO MSA XWALK")
table(metro.tracts.2000.m$merge.variable, useNA = "ifany")

nm2.metro.2000 <- metro.tracts.2000.m %>%
  filter(merge.variable == 2)

paste("ARE ALL NON-MATCHING OBS IN PUERTO RICO?")
sum(substr(nm2.metro.2000$FIPS,1,2) == "72") == nrow(nm2.metro.2000)

## keep only metro tracts ## 
metro.tracts.2000 <- metro.tracts.2000.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## how many CBSAs? ##
paste("CBSAs IN 2000 TRACT FILE")
length(unique(metro.tracts.2000$CBSA))

##
## final processing ## 
##

metro.tracts.2000.int <- metro.tracts.2000 %>%
  mutate(year = 2000,
         per_fb = ifelse(foreign_den >0, foreign_born/foreign_den, 0),
         hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, 0),
         phi1 = hhld_inc1/hhld_inc,
         phi2 = hhld_inc2/hhld_inc,
         phi3 = hhld_inc3/hhld_inc,
         phi4 = hhld_inc4/hhld_inc,
         phi5 = hhld_inc5/hhld_inc,
         phi6 = hhld_inc6/hhld_inc,
         phi7 = hhld_inc7/hhld_inc,
         phi8 = hhld_inc8/hhld_inc,
         phi9 = hhld_inc9/hhld_inc,
         phi10 = hhld_inc10/hhld_inc,
         phi11 = hhld_inc11/hhld_inc,
         phi12 = hhld_inc12/hhld_inc,
         phi13 = hhld_inc13/hhld_inc,
         phi14 = hhld_inc14/hhld_inc,
         phi15 = hhld_inc15/hhld_inc,
         phi16 = hhld_inc16/hhld_inc,
         as_phi1 = as_hhld_inc1/hhs_inc,
         as_phi2 = as_hhld_inc2/hhs_inc,
         as_phi3 = as_hhld_inc3/hhs_inc,
         as_phi4 = as_hhld_inc4/hhs_inc,
         as_phi5 = as_hhld_inc5/hhs_inc,
         as_phi6 = as_hhld_inc6/hhs_inc,
         as_phi7 = as_hhld_inc7/hhs_inc,
         as_phi8 = as_hhld_inc8/hhs_inc,
         as_phi9 = as_hhld_inc9/hhs_inc,
         as_phi10 = as_hhld_inc10/hhs_inc,
         as_phi11 = as_hhld_inc11/hhs_inc,
         as_phi12 = as_hhld_inc12/hhs_inc,
         as_phi13 = as_hhld_inc13/hhs_inc,
         as_phi14 = as_hhld_inc14/hhs_inc,
         as_phi15 = as_hhld_inc15/hhs_inc,
         as_phi16 = as_hhld_inc16/hhs_inc,
         bl_phi1 = bl_hhld_inc1/hhs_inc,
         bl_phi2 = bl_hhld_inc2/hhs_inc,
         bl_phi3 = bl_hhld_inc3/hhs_inc,
         bl_phi4 = bl_hhld_inc4/hhs_inc,
         bl_phi5 = bl_hhld_inc5/hhs_inc,
         bl_phi6 = bl_hhld_inc6/hhs_inc,
         bl_phi7 = bl_hhld_inc7/hhs_inc,
         bl_phi8 = bl_hhld_inc8/hhs_inc,
         bl_phi9 = bl_hhld_inc9/hhs_inc,
         bl_phi10 = bl_hhld_inc10/hhs_inc,
         bl_phi11 = bl_hhld_inc11/hhs_inc,
         bl_phi12 = bl_hhld_inc12/hhs_inc,
         bl_phi13 = bl_hhld_inc13/hhs_inc,
         bl_phi14 = bl_hhld_inc14/hhs_inc,
         bl_phi15 = bl_hhld_inc15/hhs_inc,
         bl_phi16 = bl_hhld_inc16/hhs_inc,
         hl_phi1 = hl_hhld_inc1/hhs_inc,
         hl_phi2 = hl_hhld_inc2/hhs_inc,
         hl_phi3 = hl_hhld_inc3/hhs_inc,
         hl_phi4 = hl_hhld_inc4/hhs_inc,
         hl_phi5 = hl_hhld_inc5/hhs_inc,
         hl_phi6 = hl_hhld_inc6/hhs_inc,
         hl_phi7 = hl_hhld_inc7/hhs_inc,
         hl_phi8 = hl_hhld_inc8/hhs_inc,
         hl_phi9 = hl_hhld_inc9/hhs_inc,
         hl_phi10 = hl_hhld_inc10/hhs_inc,
         hl_phi11 = hl_hhld_inc11/hhs_inc,
         hl_phi12 = hl_hhld_inc12/hhs_inc,
         hl_phi13 = hl_hhld_inc13/hhs_inc,
         hl_phi14 = hl_hhld_inc14/hhs_inc,
         hl_phi15 = hl_hhld_inc15/hhs_inc,
         hl_phi16 = hl_hhld_inc16/hhs_inc,
         nhwh_phi1 = nhwh_hhld_inc1/hhs_inc,
         nhwh_phi2 = nhwh_hhld_inc2/hhs_inc,
         nhwh_phi3 = nhwh_hhld_inc3/hhs_inc,
         nhwh_phi4 = nhwh_hhld_inc4/hhs_inc,
         nhwh_phi5 = nhwh_hhld_inc5/hhs_inc,
         nhwh_phi6 = nhwh_hhld_inc6/hhs_inc,
         nhwh_phi7 = nhwh_hhld_inc7/hhs_inc,
         nhwh_phi8 = nhwh_hhld_inc8/hhs_inc,
         nhwh_phi9 = nhwh_hhld_inc9/hhs_inc,
         nhwh_phi10 = nhwh_hhld_inc10/hhs_inc,
         nhwh_phi11 = nhwh_hhld_inc11/hhs_inc,
         nhwh_phi12 = nhwh_hhld_inc12/hhs_inc,
         nhwh_phi13 = nhwh_hhld_inc13/hhs_inc,
         nhwh_phi14 = nhwh_hhld_inc14/hhs_inc,
         nhwh_phi15 = nhwh_hhld_inc15/hhs_inc,
         nhwh_phi16 = nhwh_hhld_inc16/hhs_inc,
         oth_phi1 = rowSums(cbind(oth_hhld_inc1,nhpi_hhld_inc1,aian_hhld_inc1,tw_hhld_inc1),na.rm=T)/hhs_inc,
         oth_phi2 = rowSums(cbind(oth_hhld_inc2,nhpi_hhld_inc2,aian_hhld_inc2,tw_hhld_inc2),na.rm=T)/hhs_inc,
         oth_phi3 = rowSums(cbind(oth_hhld_inc3,nhpi_hhld_inc3,aian_hhld_inc3,tw_hhld_inc3),na.rm=T)/hhs_inc,
         oth_phi4 = rowSums(cbind(oth_hhld_inc4,nhpi_hhld_inc4,aian_hhld_inc4,tw_hhld_inc4),na.rm=T)/hhs_inc,
         oth_phi5 = rowSums(cbind(oth_hhld_inc5,nhpi_hhld_inc5, aian_hhld_inc5,tw_hhld_inc5),na.rm=T)/hhs_inc,
         oth_phi6 = rowSums(cbind(oth_hhld_inc6,nhpi_hhld_inc6,aian_hhld_inc6,tw_hhld_inc6),na.rm=T)/hhs_inc,
         oth_phi7 = rowSums(cbind(oth_hhld_inc7,nhpi_hhld_inc7,aian_hhld_inc7,tw_hhld_inc7),na.rm=T)/hhs_inc,
         oth_phi8 = rowSums(cbind(oth_hhld_inc8,nhpi_hhld_inc8,aian_hhld_inc8,tw_hhld_inc8),na.rm=T)/hhs_inc,
         oth_phi9 = rowSums(cbind(oth_hhld_inc9,nhpi_hhld_inc9,aian_hhld_inc9,tw_hhld_inc9),na.rm=T)/hhs_inc,
         oth_phi10 = rowSums(cbind(oth_hhld_inc10,nhpi_hhld_inc10,aian_hhld_inc10,tw_hhld_inc10),na.rm=T)/hhs_inc,
         oth_phi11 = rowSums(cbind(oth_hhld_inc11,nhpi_hhld_inc11,aian_hhld_inc11,tw_hhld_inc11),na.rm=T)/hhs_inc,
         oth_phi12 = rowSums(cbind(oth_hhld_inc12,nhpi_hhld_inc12,aian_hhld_inc12,tw_hhld_inc12),na.rm=T)/hhs_inc,
         oth_phi13 = rowSums(cbind(oth_hhld_inc13,nhpi_hhld_inc13,aian_hhld_inc13,tw_hhld_inc13),na.rm=T)/hhs_inc,
         oth_phi14 = rowSums(cbind(oth_hhld_inc14,nhpi_hhld_inc14,aian_hhld_inc14,tw_hhld_inc14),na.rm=T)/hhs_inc,
         oth_phi15 = rowSums(cbind(oth_hhld_inc15,nhpi_hhld_inc15,aian_hhld_inc15,tw_hhld_inc15),na.rm=T)/hhs_inc,
         oth_phi16 = rowSums(cbind(oth_hhld_inc16,nhpi_hhld_inc16,aian_hhld_inc16,tw_hhld_inc16),na.rm=T)/hhs_inc,
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_hl = case_when(pop_total != 0 ~ pop_hl/pop_total,
                            pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other, pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0),
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                               pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                               pop_nh_black == 0 ~ 0),
         log_hl = case_when(pop_hl != 0 ~ log(1/per_hl),
                                pop_hl == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                               pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                              pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                               rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
         trt_entropy_er = per_asian*log_asian + 
                          per_black*log_black + 
                          per_hl*log_hl + 
                          per_white*log_white +
                          per_aian*log_aian +
                          per_other*log_other,
         trt_entropy_ers = (trt_entropy_er - min(trt_entropy_er))/(max(trt_entropy_er)-min(trt_entropy_er)),
         hd_er = ifelse(trt_entropy_ers >= 0.7414 &
                        per_asian < 0.45 & 
                        per_black < 0.45 & 
                        per_hl < 0.45 & 
                        per_white < 0.45 & 
                        per_aian < 0.45 &
                        per_other < 0.45, 1, 0),
         ld_er = ifelse(trt_entropy_ers <= 0.3707 |
                        per_asian > 0.8 |
                        per_black > 0.8 |
                        per_hl > 0.8 |
                        per_white > 0.8 |
                        per_aian > 0.8 |
                        per_other > 0.8, 1, 0),
         md_er = ifelse(hd_er == 0 & 
                        ld_er == 0, 1, 0)) %>%
  rename(pop_nh_aian = pop_nh_aian) %>%
  select(GEOID,
         CBSA,
         pop_total, 
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl,
         gq_per,
         hhs,
         per_fb,
         foreign_born,
         foreign_den,
         hhld_pov_rt,
         hhld_pov_num,
         hhld_pov_den,
         median_hhld_inc,
         starts_with("phi"),
         starts_with("as_phi"),
         starts_with("bl_phi"),
         starts_with("hl_phi"),
         starts_with("nhwh_phi"),
         starts_with("oth_phi"),
         starts_with("per_"),
         starts_with("log_"),
         trt_entropy_er,
         trt_entropy_ers,
         ends_with("_er"))

## check missing ##
paste("MISSINGS IN THE 2000 TRACT FILE")
sapply(metro.tracts.2000.int, function(x) sum(is.na(x)))

##
## determine majority racial group ##
##

metro.tracts.2000.int$maj_race <- c(colnames(metro.tracts.2000.int[,c("per_asian",
                                                                      "per_black",
                                                                      "per_hl",
                                                                      "per_white",
                                                                      "per_aian",
                                                                      "per_other")])[apply(metro.tracts.2000.int[,c("per_asian",
                                                                                                                     "per_black",
                                                                                                                     "per_hl",
                                                                                                                     "per_white",
                                                                                                                     "per_aian",
                                                                                                                     "per_other")],
                                                                                                                     1,which.max)])


## correct GEOIDs according to Census guidance ## 
## https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf ##

## Pima County ## 
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019002701"] <- "04019002704"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019002903"] <- "04019002906"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019410501"] <- "04019004118"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019410502"] <- "04019004121"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019410503"] <- "04019004125"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019470400"] <- "04019005200"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "04019470500"] <- "04019005300"

## LA County ##
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "06037930401"] <- "06037137000"

## NY ## 
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940101"] <- "36053030101"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940102"] <- "36053030102"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940103"] <- "36053030103"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940200"] <- "36053030200"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940300"] <- "36053030300"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940401"] <- "36053030401"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940403"] <- "36053030403"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940600"] <- "36053030600"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36053940700"] <- "36053030402"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36065940000"] <- "36065024800"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36065940100"] <- "36065024700"
metro.tracts.2000.int$GEOID[metro.tracts.2000.int$GEOID == "36065940200"] <- "36065024900"

## save file ##

#save(metro.tracts.2000.int,
#     file= paste(data_path, "/003_2000_trts_int.Rda", sep=""))


######################
## 2010 Census data ## 
######################

load("001_2010_trts_cb.Rda")

##
## add a denominator for calculating income groups for later on ##
##

all.tracts.2010.temp <- all.tracts.2010.cb %>%
  mutate(gq_per = gq_tot/pop_total,
         gq_per_alt = gq_num/gq_den,
         hhs_inc = rowSums(cbind(aian_hhld_inc,
                                 as_hhld_inc,
                                 bl_hhld_inc,
                                 nhpi_hhld_inc,
                                 hl_hhld_inc,
                                 oth_hhld_inc,
                                 tw_hhld_inc,
                                 nhwh_hhld_inc),na.rm=T))

## verify that both group quarter definitions are the same ##

gq.comp.2010 <- all.tracts.2010.temp %>%
  select(GEOID,
         pop_total,
         gq_per,
         gq_per_alt) %>%
  mutate(comp = gq_per - gq_per_alt)

paste("SUMMARY OF DIFFERENCES IN GQ DEFINITIONS IN 2010 TRACT FILE")
summary(gq.comp.2010$comp)
summary(gq.comp.2010$comp[gq.comp.2010$pop_total > 0])

##
##  implement sample restrictions ##
##

all.tracts.2010.rd <- all.tracts.2010.temp %>%
  filter(pop_total >= 500 & 
         hhs > 10 & 
         hhs_inc > 0 & 
         gq_per <= 0.5) 

## check obs ##
paste("OBS IN 2010 TRACT FILE AFTER SAMPLE RESTRICTIONS")
nrow(all.tracts.2010.rd)

##
## limit to metro tracts ##
##

paste("CLASS OF GEOID IN 2010 TRACT FILE")
class(all.tracts.2010.rd$GEOID)
paste("GEOID CHECK IN 2010 TRACT FILE")
range(nchar(trim(all.tracts.2010.rd$GEOID)))
all.tracts.2010.rd$FIPS <- substr(all.tracts.2010.rd$GEOID,1,5)

## pre-merge checks ##
paste("CLASS OF FIPS IN 2010 TRACT FILE")
class(all.tracts.2010.rd$FIPS)
paste("FIPS CHECK IN 2010 TRACT FILE")
range(nchar(trim(all.tracts.2010.rd$FIPS)))
paste("IS 2010 TRACT FILE UNIQUE BY FIPS?")
nrow(all.tracts.2010.rd) == length(unique(all.tracts.2010.rd$FIPS))

paste("CLASS OF FIPS IN FIPS TO MSA XWALK")
class(fips.to.msas$FIPS)
paste("FIPS CHECK IN FIPS TO MSA XWALK")
range(nchar(trim(fips.to.msas$FIPS)))
paste("IS FIPS TO MSA XWALK UNIQUE BY FIPS?")
nrow(fips.to.msas) == length(unique(fips.to.msas$FIPS))

##
## merge files ## 
##

metro.tracts.2010.m <- stata.merge(all.tracts.2010.rd,
                                   fips.to.msas,
                                   "FIPS")

## diagnose merge ##
## check: the non-matching obs from fips.to.msas (2) are all in Puerto Rico ##
paste("MERGE BETWEEN 2010 TRACT FILE AND FIPS TO MSA XWALK")
table(metro.tracts.2010.m$merge.variable, useNA = "ifany")

nm2.metro.2010 <- metro.tracts.2010.m %>%
  filter(merge.variable == 2)

paste("ARE ALL NON-MATCHING OBS IN PUERTO RICO?")
sum(substr(nm2.metro.2010$FIPS,1,2) == "72") == nrow(nm2.metro.2010)

## keep only metro tracts ## 
metro.tracts.2010 <- metro.tracts.2010.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## final processing ## 
##

metro.tracts.2010.int <- metro.tracts.2010 %>%
  mutate(year = 2010,
         per_fb = ifelse(foreign_den > 0, foreign_born/foreign_den, 0),
         hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, 0),
         phi1 = hhld_inc1/hhld_inc,
         phi2 = hhld_inc2/hhld_inc,
         phi3 = hhld_inc3/hhld_inc,
         phi4 = hhld_inc4/hhld_inc,
         phi5 = hhld_inc5/hhld_inc,
         phi6 = hhld_inc6/hhld_inc,
         phi7 = hhld_inc7/hhld_inc,
         phi8 = hhld_inc8/hhld_inc,
         phi9 = hhld_inc9/hhld_inc,
         phi10 = hhld_inc10/hhld_inc,
         phi11 = hhld_inc11/hhld_inc,
         phi12 = hhld_inc12/hhld_inc,
         phi13 = hhld_inc13/hhld_inc,
         phi14 = hhld_inc14/hhld_inc,
         phi15 = hhld_inc15/hhld_inc,
         phi16 = hhld_inc16/hhld_inc,
         as_phi1 = as_hhld_inc1/hhs_inc,
         as_phi2 = as_hhld_inc2/hhs_inc,
         as_phi3 = as_hhld_inc3/hhs_inc,
         as_phi4 = as_hhld_inc4/hhs_inc,
         as_phi5 = as_hhld_inc5/hhs_inc,
         as_phi6 = as_hhld_inc6/hhs_inc,
         as_phi7 = as_hhld_inc7/hhs_inc,
         as_phi8 = as_hhld_inc8/hhs_inc,
         as_phi9 = as_hhld_inc9/hhs_inc,
         as_phi10 = as_hhld_inc10/hhs_inc,
         as_phi11 = as_hhld_inc11/hhs_inc,
         as_phi12 = as_hhld_inc12/hhs_inc,
         as_phi13 = as_hhld_inc13/hhs_inc,
         as_phi14 = as_hhld_inc14/hhs_inc,
         as_phi15 = as_hhld_inc15/hhs_inc,
         as_phi16 = as_hhld_inc16/hhs_inc,
         bl_phi1 = bl_hhld_inc1/hhs_inc,
         bl_phi2 = bl_hhld_inc2/hhs_inc,
         bl_phi3 = bl_hhld_inc3/hhs_inc,
         bl_phi4 = bl_hhld_inc4/hhs_inc,
         bl_phi5 = bl_hhld_inc5/hhs_inc,
         bl_phi6 = bl_hhld_inc6/hhs_inc,
         bl_phi7 = bl_hhld_inc7/hhs_inc,
         bl_phi8 = bl_hhld_inc8/hhs_inc,
         bl_phi9 = bl_hhld_inc9/hhs_inc,
         bl_phi10 = bl_hhld_inc10/hhs_inc,
         bl_phi11 = bl_hhld_inc11/hhs_inc,
         bl_phi12 = bl_hhld_inc12/hhs_inc,
         bl_phi13 = bl_hhld_inc13/hhs_inc,
         bl_phi14 = bl_hhld_inc14/hhs_inc,
         bl_phi15 = bl_hhld_inc15/hhs_inc,
         bl_phi16 = bl_hhld_inc16/hhs_inc,
         hl_phi1 = hl_hhld_inc1/hhs_inc,
         hl_phi2 = hl_hhld_inc2/hhs_inc,
         hl_phi3 = hl_hhld_inc3/hhs_inc,
         hl_phi4 = hl_hhld_inc4/hhs_inc,
         hl_phi5 = hl_hhld_inc5/hhs_inc,
         hl_phi6 = hl_hhld_inc6/hhs_inc,
         hl_phi7 = hl_hhld_inc7/hhs_inc,
         hl_phi8 = hl_hhld_inc8/hhs_inc,
         hl_phi9 = hl_hhld_inc9/hhs_inc,
         hl_phi10 = hl_hhld_inc10/hhs_inc,
         hl_phi11 = hl_hhld_inc11/hhs_inc,
         hl_phi12 = hl_hhld_inc12/hhs_inc,
         hl_phi13 = hl_hhld_inc13/hhs_inc,
         hl_phi14 = hl_hhld_inc14/hhs_inc,
         hl_phi15 = hl_hhld_inc15/hhs_inc,
         hl_phi16 = hl_hhld_inc16/hhs_inc,
         nhwh_phi1 = nhwh_hhld_inc1/hhs_inc,
         nhwh_phi2 = nhwh_hhld_inc2/hhs_inc,
         nhwh_phi3 = nhwh_hhld_inc3/hhs_inc,
         nhwh_phi4 = nhwh_hhld_inc4/hhs_inc,
         nhwh_phi5 = nhwh_hhld_inc5/hhs_inc,
         nhwh_phi6 = nhwh_hhld_inc6/hhs_inc,
         nhwh_phi7 = nhwh_hhld_inc7/hhs_inc,
         nhwh_phi8 = nhwh_hhld_inc8/hhs_inc,
         nhwh_phi9 = nhwh_hhld_inc9/hhs_inc,
         nhwh_phi10 = nhwh_hhld_inc10/hhs_inc,
         nhwh_phi11 = nhwh_hhld_inc11/hhs_inc,
         nhwh_phi12 = nhwh_hhld_inc12/hhs_inc,
         nhwh_phi13 = nhwh_hhld_inc13/hhs_inc,
         nhwh_phi14 = nhwh_hhld_inc14/hhs_inc,
         nhwh_phi15 = nhwh_hhld_inc15/hhs_inc,
         nhwh_phi16 = nhwh_hhld_inc16/hhs_inc,
         oth_phi1 = rowSums(cbind(oth_hhld_inc1, nhpi_hhld_inc1, aian_hhld_inc1, tw_hhld_inc1),na.rm=T)/hhs_inc,
         oth_phi2 = rowSums(cbind(oth_hhld_inc2 , nhpi_hhld_inc2 , aian_hhld_inc2 , tw_hhld_inc2),na.rm=T)/hhs_inc,
         oth_phi3 = rowSums(cbind(oth_hhld_inc3 , nhpi_hhld_inc3 , aian_hhld_inc3 , tw_hhld_inc3),na.rm=T)/hhs_inc,
         oth_phi4 = rowSums(cbind(oth_hhld_inc4 , nhpi_hhld_inc4 , aian_hhld_inc4 , tw_hhld_inc4),na.rm=T)/hhs_inc,
         oth_phi5 = rowSums(cbind(oth_hhld_inc5 , nhpi_hhld_inc5 , aian_hhld_inc5 , tw_hhld_inc5),na.rm=T)/hhs_inc,
         oth_phi6 = rowSums(cbind(oth_hhld_inc6 , nhpi_hhld_inc6 , aian_hhld_inc6 , tw_hhld_inc6),na.rm=T)/hhs_inc,
         oth_phi7 = rowSums(cbind(oth_hhld_inc7 , nhpi_hhld_inc7 , aian_hhld_inc7 , tw_hhld_inc7),na.rm=T)/hhs_inc,
         oth_phi8 = rowSums(cbind(oth_hhld_inc8 , nhpi_hhld_inc8 , aian_hhld_inc8 , tw_hhld_inc8),na.rm=T)/hhs_inc,
         oth_phi9 = rowSums(cbind(oth_hhld_inc9 , nhpi_hhld_inc9 , aian_hhld_inc9 , tw_hhld_inc9),na.rm=T)/hhs_inc,
         oth_phi10 = rowSums(cbind(oth_hhld_inc10 , nhpi_hhld_inc10 , aian_hhld_inc10 , tw_hhld_inc10),na.rm=T)/hhs_inc,
         oth_phi11 = rowSums(cbind(oth_hhld_inc11 , nhpi_hhld_inc11 , aian_hhld_inc11 , tw_hhld_inc11),na.rm=T)/hhs_inc,
         oth_phi12 = rowSums(cbind(oth_hhld_inc12 , nhpi_hhld_inc12 , aian_hhld_inc12 , tw_hhld_inc12),na.rm=T)/hhs_inc,
         oth_phi13 = rowSums(cbind(oth_hhld_inc13 , nhpi_hhld_inc13 , aian_hhld_inc13 , tw_hhld_inc13),na.rm=T)/hhs_inc,
         oth_phi14 = rowSums(cbind(oth_hhld_inc14 , nhpi_hhld_inc14 , aian_hhld_inc14 , tw_hhld_inc14),na.rm=T)/hhs_inc,
         oth_phi15 = rowSums(cbind(oth_hhld_inc15 , nhpi_hhld_inc15 , aian_hhld_inc15 , tw_hhld_inc15),na.rm=T)/hhs_inc,
         oth_phi16 = rowSums(cbind(oth_hhld_inc16 , nhpi_hhld_inc16 , aian_hhld_inc16 , tw_hhld_inc16),na.rm=T)/hhs_inc,
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_hl = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0),
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                               pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                               pop_nh_black == 0 ~ 0),
         log_hl = case_when(pop_hl != 0 ~ log(1/per_hl),
                                pop_hl == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                               pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                              pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi, pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                               rowSums(cbind(pop_nh_other,pop_nh_nhpi, pop_nh_multi),na.rm=T) == 0 ~ 0),
         trt_entropy_er = per_asian*log_asian + 
                          per_black*log_black + 
                          per_hl*log_hl + 
                          per_white*log_white +
                          per_aian*log_aian +
                          per_other*log_other,
         trt_entropy_ers = (trt_entropy_er - min(trt_entropy_er))/(max(trt_entropy_er)-min(trt_entropy_er)),
         hd_er = ifelse(trt_entropy_ers >= 0.7414 &
                        per_asian < 0.45 & 
                        per_black < 0.45 & 
                        per_hl < 0.45 & 
                        per_white < 0.45 & 
                        per_aian < 0.45 &
                        per_other < 0.45, 1, 0),
         ld_er = ifelse(trt_entropy_ers <= 0.3707 |
                        per_asian > 0.8 |
                        per_black > 0.8 |
                        per_hl > 0.8 |
                        per_white > 0.8 |
                        per_aian > 0.8 |
                        per_other > 0.8, 1, 0),
         md_er = ifelse(hd_er == 0 & 
                        ld_er == 0, 1, 0)) %>%
  select(GEOID,
         CBSA,
         pop_total, 
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl,
         gq_per,
         hhs,
         per_fb,
         foreign_born,
         foreign_den,
         hhld_pov_rt,
         hhld_pov_num,
         hhld_pov_den,
         median_hhld_inc,
         starts_with("phi"),
         starts_with("as_phi"),
         starts_with("bl_phi"),
         starts_with("hl_phi"),
         starts_with("nhwh_phi"),
         starts_with("oth_phi"),
         starts_with("per_"),
         starts_with("log_"),
         trt_entropy_er,
         trt_entropy_ers,
         ends_with("_er"))

## check missings ##
paste("MISSINGS IN 2010 TRACT FILE")
sapply(metro.tracts.2010.int, function(x) sum(is.na(x)))

##
## determine majority racial group ##
##

metro.tracts.2010.int$maj_race <- c(colnames(metro.tracts.2010.int[,c("per_asian",
                                                                        "per_black",
                                                                        "per_hl",
                                                                        "per_white",
                                                                        "per_aian",
                                                                        "per_other")])[apply(metro.tracts.2010.int[,c("per_asian",
                                                                                                                      "per_black",
                                                                                                                      "per_hl",
                                                                                                                      "per_white",
                                                                                                                      "per_aian",
                                                                                                                      "per_other")],
                                                                                                                      1,which.max)])

## save file ##

#save(metro.tracts.2010.int,
#     file= paste(data_path, "/003_2010_trts_int.Rda", sep=""))

######################
## 2020 Census data ##
######################

load("001_2020_trts_cb.Rda")

## impute key missing median household inc ##
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "06037204410"] <- 59712 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "06037207102"] <- 31071 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "06037599100"] <- 88845 # this is catalina island, imputed from the other tract on the island
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "09003524502"] <- 39779 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "12011020211"] <- 39688 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "12086005406"] <- 32353 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031380100"] <- 23417 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031550100"] <- 57780 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031570200"] <- 47198 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031611800"] <- 47181 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031671900"] <- 59663 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031830006"] <- 55634 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "17031839500"] <- 25000 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "26163503900"] <- 30792 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "26163553200"] <- 16198 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "26163553800"] <- 28973 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "34027041705"] <- 44167 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "36005036700"] <- 27375 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "36047004600"] <- 122813 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "36047117400"] <- 61250 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "36081014800"] <- 60227 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "36081053601"] <- 101136 * (437.6/377.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "39035150400"] <- 36944 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "42101020500"] <- 17392 * (437.6/383.2)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "42101023900"] <- 41917 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "48135000300"] <- 31500 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "48141010505"] <- 27964 * (437.6/410.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "48215024302"] <- 28583 * (437.6/377.8)
all.tracts.2020.cb$median_hhld_inc[all.tracts.2020.cb$GEOID == "53015980000"] <- 30592 * (437.6/383.2)

## update GEOIDs to reflect 2010 tract boundaries ##

load("001_ltdb_20t10.Rda")

## pre-merge checks ##

paste("CLASS OF GEOID IN 2020 TRACT FILE")
class(all.tracts.2020.cb$GEOID)
paste("GEOID CHECK IN 2020 TRACT FILE")
range(nchar(trim(all.tracts.2020.cb$GEOID)))
paste("IS 2020 TRACT FILE UNIQUE BY GEOID")
nrow(all.tracts.2020.cb) == length(unique(all.tracts.2020.cb$GEOID))

paste("CLASS OF GEOID IN LTDB 20T10")
class(ltdb.20t10$GEOID)
paste("GEOID CHECK IN LTDB 20T10")
range(nchar(trim(ltdb.20t10$GEOID)))

## merge the files ## 

all.tracts.2020.match.m <- stata.merge(all.tracts.2020.cb,
                                       ltdb.20t10,
                                       "GEOID")

## diagnose merge ## 
paste("MERGE BETWEEN 2020 TRACT FILE and LTDB 20T10")
table(all.tracts.2020.match.m$merge.variable, useNA = "ifany")

## keep only the matches and obs with non-zero weights ##

all.tracts.2020.match <- all.tracts.2020.match.m %>%
  filter(merge.variable == 3 &
         wt != 0) %>%
  select(-merge.variable,
         -NAM,
         -state)

## change the GEOID ##
## some 2020 tract IDs are duplicated because of  ##
## tracts that changed from 2010 to 2020; these   ##
## tracts will be condensed into one observation  ##
## via a weighted average, with weights from LTDB ##

all.tracts.2020.pr1a <- all.tracts.2020.match %>%
  select(GEOID,
         GEOID10,
         GEOID20, 
         cfips,
         everything()) %>%
  select(-median_hhld_inc) %>%
  mutate_at(vars(pop_total:nhwh_hhld_inc16),
            .funs = funs(. * wt))

all.tracts.2020.pr2a <- all.tracts.2020.pr1a %>% 
  group_by(GEOID10) %>%
  summarize_at(vars(pop_total:nhwh_hhld_inc16),
               funs(sum(., na.rm=T))) %>%
  mutate(## use ACS variables for valid denominator ## 
         gq_per = gq_num/gq_den) %>%
  rename(GEOID = GEOID10)

## check ## 
paste("CHECK GQ PER IN 2020 TRACT FILE")
summary(all.tracts.2020.pr2a$gq_per[all.tracts.2020.pr2a$gq_den > 0])

##
## separate process for median hhld inc ## 
##

all.tracts.2020.pr2b <- all.tracts.2020.match %>%
  select(GEOID,
         GEOID10,
         GEOID20, 
         median_hhld_inc,
         wt) %>%
  group_by(GEOID10) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  rename(GEOID = GEOID10)

## check obs ## 
paste("OBS IN 2020 TRACT FILE A")
nrow(all.tracts.2020.pr2a)
paste("OBS IN 2020 TRACT FILE B")
nrow(all.tracts.2020.pr2b)

## merge files ##
all.tracts.2020.cbpf.m <-  stata.merge(all.tracts.2020.pr2a,
                                       all.tracts.2020.pr2b, 
                                       "GEOID") 

## check merge ## 
paste("MERGE BETWEEN 2020 TRACT FILES A AND B")
table(all.tracts.2020.cbpf.m$merge.variable, useNA = "ifany")

## keep matches ## 
all.tracts.2020.cbpf <- all.tracts.2020.cbpf.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

## check obs ##
paste("OBS IN 2020 TRACT FILE AFTER A AND B MERGE")
nrow(all.tracts.2020.cbpf)

##
## output file for later social capital analyses #
##

save(all.tracts.2020.cbpf,
     file = "002_trts_fsc.Rda")

##
## add a denominator for calculating income groups for later on ##
## and implement sample restrictions ##
##

all.tracts.2020.cb <- all.tracts.2020.cbpf %>%
  mutate(hhs_inc = rowSums(cbind(aian_hhld_inc,
                                 as_hhld_inc,
                                 bl_hhld_inc,
                                 nhpi_hhld_inc,
                                 hl_hhld_inc,
                                 oth_hhld_inc,
                                 tw_hhld_inc,
                                 nhwh_hhld_inc),na.rm=T), 
         FIPS = substr(GEOID,1,5)) %>%
  filter(pop_total >= 500 & 
         hhs > 10 & 
         hhs_inc > 0 &
         gq_per <= 0.5)

##
## condense r0 separately ##
##

all.tracts.2020.r0 <- all.tracts.2020.match %>%
  filter(GEOID10 %in% all.tracts.2020.pr2a$GEOID) %>%
  select(GEOID,
         GEOID10,
         pop_total,
         r0) %>%
  mutate(r0_new = pop_total*r0)

## checks ## 
paste("OBS IN 2020 r0 FILE")
nrow(all.tracts.2020.r0)
paste("UNIQUE GEOIDS IN 2020 r0 FILE")
length(unique(all.tracts.2020.r0$GEOID10))

all.tracts.2020.r0.final <- all.tracts.2020.r0 %>%
  group_by(GEOID10) %>%
  summarize(num = sum(r0_new, na.rm=T),
            den = sum(pop_total, na.rm=T),
            r0 = num/den) %>%
  rename(GEOID = GEOID10) %>% 
  select(GEOID,
         r0)

## obs check ## 
paste("OBS IN FILE 2020 r0 FILE")
nrow(all.tracts.2020.r0.final)

##
## merge on correct r0 ##
##

all.tracts.2020.pr3.m <- stata.merge(all.tracts.2020.cb,
                                     all.tracts.2020.r0.final,
                                     "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2020 TRACT FILE AND 2020 r0 FILE")
table(all.tracts.2020.pr3.m$merge.variable, useNA = "ifany")

## keep matches ## 
all.tracts.2020.pr3 <- all.tracts.2020.pr3.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(FIPS = substr(GEOID,1,5))

##
## now, limit census tracts to metro areas ## 
##

## pre-merge checks ## 
paste("CLASS OF FIPS IN 2020 TRACT FILE")
class(all.tracts.2020.pr3$FIPS)
paste("FIPS CHECK IN 2020 TRACT FILE")
range(nchar(trim(all.tracts.2020.pr3$FIPS)))
paste("IS FIPS TO MSA XWALK UNIQUE BY FIPS?")
nrow(all.tracts.2020.pr3) == length(unique(all.tracts.2020.pr3$FIPS))

paste("CLASS OF FIPS IN FIPS TO MSA XWALK")
class(fips.to.msas$FIPS)
paste("FIPS CHECK IN FIPS TO MSA XWALK")
range(nchar(trim(fips.to.msas$FIPS)))
paste("IS FIPS TO MSA XWALK UNIQUE BY FIPS?")
nrow(fips.to.msas) == length(unique(fips.to.msas$FIPS))

##
## merge files ## 
##

metro.tracts.2020.m <- stata.merge(all.tracts.2020.pr3,
                                   fips.to.msas,
                                   "FIPS")

## diagnose merge ##
## check: the non-matching obs from fips.to.msas (2) are all in Puerto Rico ##
paste("MERGE BETWEEN 2020 TRACT FILE AND FIPS TO MSA XWALK")
table(metro.tracts.2020.m$merge.variable, useNA = "ifany")

nm2.metro.2020 <- metro.tracts.2020.m %>%
  filter(merge.variable == 2)

paste("ARE ALL NON-MATCHING OBS IN PUERTO RICO?")
sum(substr(nm2.metro.2020$FIPS,1,2) == "72") == nrow(nm2.metro.2020)

## keep only metro tracts ## 
metro.tracts.2020 <- metro.tracts.2020.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

##
## final processing ##
##

metro.tracts.2020.int <- metro.tracts.2020 %>%
  mutate(year = 2020, 
         per_fb = ifelse(foreign_den > 0, foreign_born/foreign_den, 0),
         hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, 0),
         phi1 = hhld_inc1/hhld_inc,
         phi2 = hhld_inc2/hhld_inc,
         phi3 = hhld_inc3/hhld_inc,
         phi4 = hhld_inc4/hhld_inc,
         phi5 = hhld_inc5/hhld_inc,
         phi6 = hhld_inc6/hhld_inc,
         phi7 = hhld_inc7/hhld_inc,
         phi8 = hhld_inc8/hhld_inc,
         phi9 = hhld_inc9/hhld_inc,
         phi10 = hhld_inc10/hhld_inc,
         phi11 = hhld_inc11/hhld_inc,
         phi12 = hhld_inc12/hhld_inc,
         phi13 = hhld_inc13/hhld_inc,
         phi14 = hhld_inc14/hhld_inc,
         phi15 = hhld_inc15/hhld_inc,
         phi16 = hhld_inc16/hhld_inc,
         as_phi1 = as_hhld_inc1/hhs_inc,
         as_phi2 = as_hhld_inc2/hhs_inc,
         as_phi3 = as_hhld_inc3/hhs_inc,
         as_phi4 = as_hhld_inc4/hhs_inc,
         as_phi5 = as_hhld_inc5/hhs_inc,
         as_phi6 = as_hhld_inc6/hhs_inc,
         as_phi7 = as_hhld_inc7/hhs_inc,
         as_phi8 = as_hhld_inc8/hhs_inc,
         as_phi9 = as_hhld_inc9/hhs_inc,
         as_phi10 = as_hhld_inc10/hhs_inc,
         as_phi11 = as_hhld_inc11/hhs_inc,
         as_phi12 = as_hhld_inc12/hhs_inc,
         as_phi13 = as_hhld_inc13/hhs_inc,
         as_phi14 = as_hhld_inc14/hhs_inc,
         as_phi15 = as_hhld_inc15/hhs_inc,
         as_phi16 = as_hhld_inc16/hhs_inc,
         bl_phi1 = bl_hhld_inc1/hhs_inc,
         bl_phi2 = bl_hhld_inc2/hhs_inc,
         bl_phi3 = bl_hhld_inc3/hhs_inc,
         bl_phi4 = bl_hhld_inc4/hhs_inc,
         bl_phi5 = bl_hhld_inc5/hhs_inc,
         bl_phi6 = bl_hhld_inc6/hhs_inc,
         bl_phi7 = bl_hhld_inc7/hhs_inc,
         bl_phi8 = bl_hhld_inc8/hhs_inc,
         bl_phi9 = bl_hhld_inc9/hhs_inc,
         bl_phi10 = bl_hhld_inc10/hhs_inc,
         bl_phi11 = bl_hhld_inc11/hhs_inc,
         bl_phi12 = bl_hhld_inc12/hhs_inc,
         bl_phi13 = bl_hhld_inc13/hhs_inc,
         bl_phi14 = bl_hhld_inc14/hhs_inc,
         bl_phi15 = bl_hhld_inc15/hhs_inc,
         bl_phi16 = bl_hhld_inc16/hhs_inc,
         hl_phi1 = hl_hhld_inc1/hhs_inc,
         hl_phi2 = hl_hhld_inc2/hhs_inc,
         hl_phi3 = hl_hhld_inc3/hhs_inc,
         hl_phi4 = hl_hhld_inc4/hhs_inc,
         hl_phi5 = hl_hhld_inc5/hhs_inc,
         hl_phi6 = hl_hhld_inc6/hhs_inc,
         hl_phi7 = hl_hhld_inc7/hhs_inc,
         hl_phi8 = hl_hhld_inc8/hhs_inc,
         hl_phi9 = hl_hhld_inc9/hhs_inc,
         hl_phi10 = hl_hhld_inc10/hhs_inc,
         hl_phi11 = hl_hhld_inc11/hhs_inc,
         hl_phi12 = hl_hhld_inc12/hhs_inc,
         hl_phi13 = hl_hhld_inc13/hhs_inc,
         hl_phi14 = hl_hhld_inc14/hhs_inc,
         hl_phi15 = hl_hhld_inc15/hhs_inc,
         hl_phi16 = hl_hhld_inc16/hhs_inc,
         nhwh_phi1 = nhwh_hhld_inc1/hhs_inc,
         nhwh_phi2 = nhwh_hhld_inc2/hhs_inc,
         nhwh_phi3 = nhwh_hhld_inc3/hhs_inc,
         nhwh_phi4 = nhwh_hhld_inc4/hhs_inc,
         nhwh_phi5 = nhwh_hhld_inc5/hhs_inc,
         nhwh_phi6 = nhwh_hhld_inc6/hhs_inc,
         nhwh_phi7 = nhwh_hhld_inc7/hhs_inc,
         nhwh_phi8 = nhwh_hhld_inc8/hhs_inc,
         nhwh_phi9 = nhwh_hhld_inc9/hhs_inc,
         nhwh_phi10 = nhwh_hhld_inc10/hhs_inc,
         nhwh_phi11 = nhwh_hhld_inc11/hhs_inc,
         nhwh_phi12 = nhwh_hhld_inc12/hhs_inc,
         nhwh_phi13 = nhwh_hhld_inc13/hhs_inc,
         nhwh_phi14 = nhwh_hhld_inc14/hhs_inc,
         nhwh_phi15 = nhwh_hhld_inc15/hhs_inc,
         nhwh_phi16 = nhwh_hhld_inc16/hhs_inc,
         oth_phi1 = rowSums(cbind(oth_hhld_inc1 , nhpi_hhld_inc1 , aian_hhld_inc1 , tw_hhld_inc1),na.rm=T)/hhs_inc,
         oth_phi2 = rowSums(cbind(oth_hhld_inc2 , nhpi_hhld_inc2 , aian_hhld_inc2 , tw_hhld_inc2),na.rm=T)/hhs_inc,
         oth_phi3 = rowSums(cbind(oth_hhld_inc3 , nhpi_hhld_inc3 , aian_hhld_inc3 , tw_hhld_inc3),na.rm=T)/hhs_inc,
         oth_phi4 = rowSums(cbind(oth_hhld_inc4 , nhpi_hhld_inc4 , aian_hhld_inc4 , tw_hhld_inc4),na.rm=T)/hhs_inc,
         oth_phi5 = rowSums(cbind(oth_hhld_inc5 , nhpi_hhld_inc5 , aian_hhld_inc5 , tw_hhld_inc5),na.rm=T)/hhs_inc,
         oth_phi6 = rowSums(cbind(oth_hhld_inc6 , nhpi_hhld_inc6 , aian_hhld_inc6 , tw_hhld_inc6),na.rm=T)/hhs_inc,
         oth_phi7 = rowSums(cbind(oth_hhld_inc7 , nhpi_hhld_inc7 , aian_hhld_inc7 , tw_hhld_inc7),na.rm=T)/hhs_inc,
         oth_phi8 = rowSums(cbind(oth_hhld_inc8 , nhpi_hhld_inc8 , aian_hhld_inc8 , tw_hhld_inc8),na.rm=T)/hhs_inc,
         oth_phi9 = rowSums(cbind(oth_hhld_inc9 , nhpi_hhld_inc9 , aian_hhld_inc9 , tw_hhld_inc9),na.rm=T)/hhs_inc,
         oth_phi10 = rowSums(cbind(oth_hhld_inc10 , nhpi_hhld_inc10 , aian_hhld_inc10 , tw_hhld_inc10),na.rm=T)/hhs_inc,
         oth_phi11 = rowSums(cbind(oth_hhld_inc11 , nhpi_hhld_inc11 , aian_hhld_inc11 , tw_hhld_inc11),na.rm=T)/hhs_inc,
         oth_phi12 = rowSums(cbind(oth_hhld_inc12 , nhpi_hhld_inc12 , aian_hhld_inc12 , tw_hhld_inc12),na.rm=T)/hhs_inc,
         oth_phi13 = rowSums(cbind(oth_hhld_inc13 , nhpi_hhld_inc13 , aian_hhld_inc13 , tw_hhld_inc13),na.rm=T)/hhs_inc,
         oth_phi14 = rowSums(cbind(oth_hhld_inc14 , nhpi_hhld_inc14 , aian_hhld_inc14 , tw_hhld_inc14),na.rm=T)/hhs_inc,
         oth_phi15 = rowSums(cbind(oth_hhld_inc15 , nhpi_hhld_inc15 , aian_hhld_inc15 , tw_hhld_inc15),na.rm=T)/hhs_inc,
         oth_phi16 = rowSums(cbind(oth_hhld_inc16 , nhpi_hhld_inc16 , aian_hhld_inc16 , tw_hhld_inc16),na.rm=T)/hhs_inc,
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_hl = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other ,pop_nh_nhpi ,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0),
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                               pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                               pop_nh_black == 0 ~ 0),
         log_hl = case_when(pop_hl != 0 ~ log(1/per_hl),
                                pop_hl == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                               pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                              pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                               rowSums(cbind(pop_nh_other, pop_nh_nhpi, pop_nh_multi),na.rm=T) == 0 ~ 0),
         trt_entropy_er = per_asian*log_asian + 
                          per_black*log_black + 
                          per_hl*log_hl + 
                          per_white*log_white +
                          per_aian*log_aian +
                          per_other*log_other,
         trt_entropy_ers = (trt_entropy_er - min(trt_entropy_er))/(max(trt_entropy_er)-min(trt_entropy_er)),
         hd_er = ifelse(trt_entropy_ers >= 0.7414 &
                        per_asian < 0.45 & 
                        per_black < 0.45 & 
                        per_hl < 0.45 & 
                        per_white < 0.45 & 
                        per_aian < 0.45 &
                        per_other < 0.45, 1, 0),
         ld_er = ifelse(trt_entropy_ers <= 0.3707 |
                        per_asian > 0.8 |
                        per_black > 0.8 |
                        per_hl > 0.8 |
                        per_white > 0.8 |
                        per_aian > 0.8 |
                        per_other > 0.8, 1, 0),
         md_er = ifelse(hd_er == 0 & 
                        ld_er == 0, 1, 0)) %>%
  select(GEOID,
         CBSA,
         pop_total, 
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl,
         gq_per,
         hhs,
         per_fb,
         foreign_born,
         foreign_den,
         hhld_pov_rt,
         hhld_pov_num,
         hhld_pov_den,
         median_hhld_inc,
         starts_with("phi"),
         starts_with("as_phi"),
         starts_with("bl_phi"),
         starts_with("hl_phi"),
         starts_with("nhwh_phi"),
         starts_with("oth_phi"),
         starts_with("per_"),
         starts_with("log_"),
         trt_entropy_er,
         trt_entropy_ers,
         ends_with("_er"))

## check missings ## 
paste("MISSINGS IN 2020 TRACT FILE")
sapply(metro.tracts.2020.int, function(x) sum(is.na(x)))

##
## determine majority racial group ##
##

metro.tracts.2020.int$maj_race <- c(colnames(metro.tracts.2020.int[,c("per_asian",
                                                                      "per_black",
                                                                      "per_hl",
                                                                      "per_white",
                                                                      "per_aian",
                                                                      "per_other")])[apply(metro.tracts.2020.int[,c("per_asian",
                                                                                                                    "per_black",
                                                                                                                    "per_hl",
                                                                                                                    "per_white",
                                                                                                                    "per_aian",
                                                                                                                    "per_other")],
                                                                                           1,which.max)])

## save file ##

#save(metro.tracts.2020.int,
#     file= paste(data_path, "/003_2020_trt_int.Rda", sep=""))

##########################
## create analytic file ##
##########################

## checks ##

paste("COLUMNS IN 2000 TRACT FILE")
ncol(metro.tracts.2000.int)
paste("COLUMNS IN 2010 TRACT FILE")
ncol(metro.tracts.2010.int)
paste("COLUMNS IN 2020 TRACT FILE")
ncol(metro.tracts.2020.int)

## wide file first ##

t1 <- metro.tracts.2000.int %>%
  rename_at(vars(pop_total:maj_race),function(x) paste0(x,"_2000"))

t2 <- metro.tracts.2010.int %>%
  rename_at(vars(pop_total:maj_race),function(x) paste0(x,"_2010"))

t3 <- metro.tracts.2020.int %>%
  rename_at(vars(pop_total:maj_race),function(x) paste0(x,"_2020"))

t1t2.m <- stata.merge(t1, t2,
                      c("GEOID","CBSA"))

## check first merge ##
paste("MERGE BETWEEN 2000 AND 2010 TRACT FILES")
table(t1t2.m$merge.variable, useNA = "ifany")

##
## figure out mismatches ##
##

## 2010 data for valid tracts in 2000  ## 

miss.2000 <- t1t2.m %>%
  filter(merge.variable == 1) 

miss.2000.data <- all.tracts.2010.temp %>%
  filter(GEOID %in% miss.2000$GEOID)

#sum(miss.2000.data$pop_total < 500, na.rm=T)
#sum(miss.2000.data$hhs <= 10, na.rm=T)
#sum(miss.2000.data$hhs_inc == 0, na.rm=T)
#sum(miss.2000.data$gq_per > 0.5, na.rm=T)

paste("ARE NON-MATCHING OBS FROM 2000 DUE TO SAMPLE RESTRICTIONS?")
sum(miss.2000.data$pop_total < 500 | miss.2000.data$hhs <= 10 | miss.2000.data$hhs_inc == 0 | miss.2000.data$gq_per > 0.5) == nrow(miss.2000.data)

## 2000 data for valid tracts in 2010 ## 

miss.2010 <- t1t2.m %>%
  filter(merge.variable == 2) 

## 43 obs are weighted 0 in the LTDB and 1 obs is not in the LTDB ##
## this explains the discrepancy between the 674 obs in miss.2010 and 630 in miss.2010.data ##

miss.2010.data <- all.tracts.2000.rf %>%
  filter(GEOID %in% miss.2010$GEOID)

mystery.tracts <- miss.2010 %>%
  filter(GEOID %notin% miss.2010.data$GEOID)

#sum(miss.2010.data$pop_total < 500, na.rm=T)
#sum(miss.2010.data$hhs <= 10, na.rm=T)
#sum(miss.2010.data$hhs_inc == 0, na.rm=T)
#sum(miss.2010.data$gq_per > 0.5, na.rm=T)

paste("ARE NON-MATCHING OBS FROM 2010 DUE TO SAMPLE RESTRICTIONS?")
sum(miss.2010.data$pop_total < 500 | miss.2010.data$hhs <= 10 | miss.2010.data$hhs_inc == 0 | miss.2010.data$gq_per > 0.5) == nrow(miss.2010.data)

## keep matches ##
t1t2 <- t1t2.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

af.wide.m <- stata.merge(t1t2,
                         t3,
                         c("GEOID","CBSA"))

## check merge ##
paste("MERGE BETWEEN MATCHED 2000-2010 TRACTS AND 2020 TRACT FILE")
table(af.wide.m$merge.variable, useNA = "ifany")

##
## figure out mismatches ##
##

## 2020 data for valid tracts in 2010 ## 

miss.2010.t2 <- af.wide.m %>%
  filter(merge.variable == 1) 

miss.2010.t2.data <- all.tracts.2020.cbpf %>%
  filter(GEOID %in% miss.2010.t2$GEOID)

## 103 of these missings are in the LTDB ## 

check1 <- all.tracts.2020.cbpf %>%
  filter(GEOID %in% miss.2010.t2$GEOID & 
         GEOID %in% ltdb.20t10$GEOID10)

## these 103 missings are DQed in the 2020 data ## 

check2 <- check1 %>%
  filter(pop_total < 500 | 
         hhs <= 10 |
         hhld_inc == 0 |
         gq_per > 0.5)

## the remaining 20 are not in the 2020 data at all ## 

check3 <- all.tracts.2020.cbpf %>%
  filter(GEOID %in% miss.2010.t2$GEOID)

paste("ARE THE NON-MATCHING OBS FROM 2000-2010 DUE TO SAMPLE RESTRICTIONS?")
nrow(check2) == nrow(miss.2010.t2.data)

## 2000 and 2010 data for valid tracts in 2020 ## 

miss.2020 <- af.wide.m %>%
  filter(merge.variable == 2) 

## invalid in 2000 ##
miss.2020.data.A <- all.tracts.2000.pr2a %>%
  filter(GEOID %in% miss.2020$GEOID)

## invalid in 2010 ##
miss.2020.data.B <- all.tracts.2010.temp %>%
  filter(GEOID %in% miss.2020$GEOID)

## 43 of the 0 weighted tracts are accounted for in the missing tracts from 2000-2010 to 2020 ##
## plus the 20 obs in 2020 that aren't in the crosswalk ##

missA <- miss.2020.data.A %>%
  filter(pop_total < 500 |
         hhs <= 10 | 
         hhs_inc == 0 | 
         gq_per > 0.5) 

missB <- miss.2020.data.B %>%
  filter((pop_total < 500 | 
         hhs <= 10 | 
         hhs_inc == 0 | 
         gq_per > 0.5) & 
         GEOID %notin% missA$GEOID)

paste("ARE THE NON-MISSING OBS FROM 2020 TRACT FILE ACCOUNTED FOR?")
nrow(missA) + nrow(missB) + 43 + 20 == nrow(miss.2020)

##
## all mismatches accounted for ##
## keep matches ##
## create segregation indices ##
##

af.wide <- af.wide.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 
  
##
## send data off to create rank H ##
##

## export 2000 income bins to calculate rank order H ##
msa.income.2000 <- metro.tracts.2000 %>%
  filter(GEOID %in% af.wide$GEOID) %>%
  select(GEOID, 
         CBSA,
         hhld_inc1,
         hhld_inc2,
         hhld_inc3,
         hhld_inc4,
         hhld_inc5,
         hhld_inc6,
         hhld_inc7,
         hhld_inc8,
         hhld_inc9,
         hhld_inc10,
         hhld_inc11,
         hhld_inc12,
         hhld_inc13,
         hhld_inc14,
         hhld_inc15,
         hhld_inc16,
         hhs,
         r0) %>%
  mutate(tract = as.numeric(GEOID),
         metro = as.numeric(CBSA),
         tot = round(hhs,0)) %>%
  select(tract,
         metro,
         everything(),
         -GEOID,
         -CBSA,
         -hhs) %>%
  rename_with(~str_remove(., 'hhld_'))

#summary(msa.income.2000)

## output file ## 
#write.dta(msa.income.2000,
#          file= paste(data_path, "/003_msa_income_2000_new.dta", sep=""))

## export 2010 income bins to calculate rank order H ##
msa.income.2010 <- metro.tracts.2010 %>%
  filter(GEOID %in% af.wide$GEOID) %>%
  select(GEOID, 
         CBSA,
         hhld_inc1,
         hhld_inc2,
         hhld_inc3,
         hhld_inc4,
         hhld_inc5,
         hhld_inc6,
         hhld_inc7,
         hhld_inc8,
         hhld_inc9,
         hhld_inc10,
         hhld_inc11,
         hhld_inc12,
         hhld_inc13,
         hhld_inc14,
         hhld_inc15,
         hhld_inc16,
         hhs,
         r0) %>%
  mutate(tract = as.numeric(GEOID),
         metro = as.numeric(CBSA),
         tot = round(hhs,0)) %>%
  select(tract,
         metro,
         everything(),
         -GEOID,
         -CBSA,
         -hhs) %>%
  rename_with(~str_remove(., 'hhld_'))

#summary(msa.income.2010)

## output file ## 
#write.dta(msa.income.2010,
#          file= paste(data_path, "/003_msa_income_2010_new.dta", sep=""))

## export 2020 income bins to calculate rank order H ##
msa.income.2020 <- metro.tracts.2020 %>%
  filter(GEOID %in% af.wide$GEOID) %>%
  select(GEOID, 
         CBSA,
         hhld_inc1,
         hhld_inc2,
         hhld_inc3,
         hhld_inc4,
         hhld_inc5,
         hhld_inc6,
         hhld_inc7,
         hhld_inc8,
         hhld_inc9,
         hhld_inc10,
         hhld_inc11,
         hhld_inc12,
         hhld_inc13,
         hhld_inc14,
         hhld_inc15,
         hhld_inc16,
         hhs,
         r0) %>%
  mutate(tract = as.numeric(GEOID),
         metro = as.numeric(CBSA),
         tot = round(hhs,0)) %>%
  select(tract,
         metro,
         everything(),
         -GEOID,
         -CBSA,
         -hhs) %>%
  rename_with(~str_remove(., 'hhld_'))

#summary(msa.income.2020)

## output file ## 
#write.dta(msa.income.2020,
#          file= paste(data_path, "/003_msa_income_2022_new.dta", sep=""))

######################
## create MSA files ##
######################

##########
## 2000 ## 
##########

load("001_2000_counties_cb.Rda")
all.counties.2000.cb$FIPS <- all.counties.2000.cb$GEOID

county.t.msa <- census.del.2020.final %>%
  select(FIPS,
         `CBSA Code`,
         `CBSA Title`) %>%
  rename(CBSA = `CBSA Code`,,
         cbsa_name = `CBSA Title`)

##
## bring on CBSA codes 
##

all.counties.2000.allvars.m <- stata.merge(all.counties.2000.cb,
                                           county.t.msa, 
                                           "FIPS")

## check merge ##
paste("MERGE BETWEEN 2000 COUNTIES AND COUNTY TO MSA XWALK")
table(all.counties.2000.allvars.m$merge.variable, useNA = "ifany")

## keep matches ##
all.counties.2000.allvars <- all.counties.2000.allvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## start with unweighted vars ##
agg.msa.2000.nowt <- all.counties.2000.allvars %>%
  select(CBSA,
         pop_total,
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl) %>%
  group_by(CBSA) %>%
  summarize_at(vars(pop_total:pop_hl), sum, na.rm=T)

## now, weighted vars ##
agg.msa.2000.wt <- all.counties.2000.allvars %>%
  select(CBSA,
         pop_total,
         median_hhld_inc) %>%
  group_by(CBSA) %>%
  mutate(pop_total_msa = sum(pop_total, na.rm=T),
         msa_wt = pop_total/pop_total_msa) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., msa_wt, na.rm=T)))

## merge ##
agg.msa.2000.m <- stata.merge(agg.msa.2000.wt,
                              agg.msa.2000.nowt,
                              "CBSA")

## check merge ##
paste("MERGE BETWEEN 2000 MSA FILES A AND B")
table(agg.msa.2000.m$merge.variable, useNA = "ifany")

## final file ##
agg.msa.2000 <- agg.msa.2000.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(per_asian_msa = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                                   pop_total == 0 ~ 0),
         per_black_msa = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                                   pop_total == 0 ~ 0),
         per_hl_msa = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                    pop_total == 0 ~ 0),
         per_white_msa = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                                   pop_total == 0 ~ 0),
         per_aian_msa = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                                  pop_total == 0 ~ 0),
         per_other_msa = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                                   pop_total == 0 ~ 0),
         log_asian_msa = case_when(pop_nh_asian != 0 ~ log(1/per_asian_msa),
                                   pop_nh_asian == 0 ~ 0),
         log_black_msa = case_when(pop_nh_black != 0 ~ log(1/per_black_msa),
                                   pop_nh_black == 0 ~ 0),
         log_hl_msa = case_when(pop_hl != 0 ~ log(1/per_hl_msa),
                                    pop_hl == 0 ~ 0),
         log_white_msa = case_when(pop_nh_white != 0 ~ log(1/per_white_msa),
                                   pop_nh_white == 0 ~ 0),
         log_aian_msa = case_when(pop_nh_aian != 0 ~ log(1/per_aian_msa),
                                  pop_nh_aian == 0 ~ 0),
         log_other_msa = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other_msa),
                                   rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
         msa_entropy_er = per_asian_msa*log_asian_msa + 
                          per_black_msa*log_black_msa + 
                          per_hl_msa*log_hl_msa + 
                          per_white_msa*log_white_msa +
                          per_aian_msa*log_aian_msa +
                          per_other_msa*log_other_msa,
         msa_entropy_ers = (msa_entropy_er - min(msa_entropy_er))/(max(msa_entropy_er)-min(msa_entropy_er))) %>%
  rename(pop_total_msa = pop_total,
         median_hhld_inc_msa = median_hhld_inc) %>%
  select(CBSA,
         pop_total_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_white_msa,
         per_aian_msa,
         per_other_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa)

## now, merge this info back onto tracts ## 

metro.tracts.2000.fm <- af.wide %>%
  select(GEOID,
         CBSA,
         ends_with("_2000")) %>%
  rename_at(.vars = vars(ends_with("_2000")),
            .funs = funs(sub("[_]2000$", "", .)))

metro.tracts.2000.wmsa.m <- stata.merge(metro.tracts.2000.fm,
                                        agg.msa.2000,
                                        "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2000 TRACT AND MSA FILES")
table(metro.tracts.2000.wmsa.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2000.wmsa <- metro.tracts.2000.wmsa.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## bring on rank H ##
##

rankH.2000 <- read_dta("acs_rank_H_2000.dta")

rankH.2000.pr <- rankH.2000 %>%
  group_by(metro) %>%
  summarize(h4_adj = mean(h4_adj, na.rm=T)) %>%
  mutate(CBSA = as.character(metro)) %>%
  select(CBSA,
        h4_adj) %>%
  rename(msa_H4_adj = h4_adj) 

## merge ## 
metro.tracts.2000.wh4.m <- stata.merge(metro.tracts.2000.wmsa,
                                       rankH.2000.pr,
                                       "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2000 TRACT FILE AND 2000 RANK H FILE")
table(metro.tracts.2000.wh4.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2000.wh4 <- metro.tracts.2000.wh4.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

##
## calculate msa segregation (mutual information)
##

metro.tracts.2000.D <- metro.tracts.2000.wh4 %>%
  mutate(D_pre_comp_aian = case_when(per_aian !=0 & per_aian_msa != 0 ~ log(per_aian/per_aian_msa),
                                     per_aian == 0 | per_aian_msa == 0 ~ 0),
         D_pre_comp_asian = case_when(per_asian !=0 &  per_asian_msa != 0 ~ log(per_asian/per_asian_msa),
                                      per_asian == 0 | per_asian_msa == 0 ~ 0),
         D_pre_comp_black = case_when(per_black !=0 & per_black_msa != 0 ~ log(per_black/per_black_msa),
                                      per_black == 0 | per_black_msa == 0 ~ 0),
         D_pre_comp_hl = case_when(per_hl !=0 &  per_hl_msa != 0 ~ log(per_hl/per_hl_msa),
                                       per_hl == 0 | per_hl_msa == 0 ~ 0),
         D_pre_comp_other = case_when(per_other !=0 & per_other_msa != 0 ~ log(per_other/per_other_msa),
                                      per_other == 0 | per_other_msa == 0 ~ 0),
         D_pre_comp_white = case_when(per_white !=0 &  per_white_msa != 0 ~ log(per_white/per_white_msa),
                                      per_white == 0 | per_white_msa == 0 ~ 0),
         D_comp_aian = per_aian * D_pre_comp_aian,
         D_comp_asian = per_asian * D_pre_comp_asian,
         D_comp_black = per_black * D_pre_comp_black,
         D_comp_hl = per_hl * D_pre_comp_hl,
         D_comp_other = per_other * D_pre_comp_other,
         D_comp_white = per_white * D_pre_comp_white,
         D_comp_all = rowSums(cbind(D_comp_aian,D_comp_asian,D_comp_black,D_comp_hl,D_comp_other,D_comp_white),na.rm=T),
         D_comp_final = (pop_total/pop_total_msa) * D_comp_all) %>%
  group_by(CBSA) %>%
  mutate(tracts_calc = n(),
         msa_D = sum(D_comp_final,na.rm=T)) %>%
  ungroup()

metro.tracts.2000.D$D_int <- ifelse(metro.tracts.2000.D$D_comp_all < median(metro.tracts.2000.D$D_comp_all,na.rm=T) + 
                                                                     sd(metro.tracts.2000.D$D_comp_all,na.rm=T), 1, 0) 
paste("CHECK 2000 MUTUAL INFO")
table(metro.tracts.2000.D$D_int, useNA = "ifany")

##
## calculate entropy by SES ##
##

## prep data ##
metro.tracts.2000.incl <- metro.tracts.2000.D %>%
  mutate(vlil = 0.5*median_hhld_inc_msa,
         lil = 0.8*median_hhld_inc_msa,
         mil = median_hhld_inc_msa,
         hmil = 1.2*median_hhld_inc_msa,
         hil = 1.5*median_hhld_inc_msa)

## keep only necessary vars ##
ses.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil)

## check for missing variables ##
paste("SUMMARY OF SES 2000 FILE")
summary(ses.2000)
paste("MISSINGS IN 2000 SES FILE")
sapply(ses.2000, function(x) sum(is.na(x)))

## input data into function that calculates distribution by HUD income limits ##
pp.ses.2000 <- as.data.frame(lapply(ses.2000[,18:22], hud_group, indata=ses.2000))

## input resulting data into cleaning function ##
pp.ses.2000.fin <- rd.data(pp.ses.2000)

## checks ## 
t.ses.2000 <- pp.ses.2000.fin %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF 2000 SES % WERE CREATED CORRECTLY")
summary(t.ses.2000$sum)

## calculate SES entropy ##
ses.entropy.2000 <- pp.ses.2000.fin %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_ses = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_ses)

## check ##
paste("2000 SES ENTROPY")
summary(ses.entropy.2000$trt_entropy_ses)

## merge SES entropy onto working data ##
ses.entropy.final.2000.m <- stata.merge(pp.ses.2000.fin,
                                        ses.entropy.2000,
                                        "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2000 SES FILE AND ENTROPY OUTPUT")
table(ses.entropy.final.2000.m$merge.variable, useNA = "ifany")

## keep matches ##
ses.entropy.final.2000 <- ses.entropy.final.2000.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## determine majority/plurality SES group ##
##

ses.entropy.final.2000$maj_ses <- c(colnames(ses.entropy.final.2000[,c("linc",
                                                                       "minc",
                                                                       "hinc")])[apply(ses.entropy.final.2000[,c("linc",
                                                                                                                 "minc",
                                                                                                                 "hinc")],
                                                                                      1,which.max)])


##
## now, create entropy by SES and joint entropy ##
##

pas.joint.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("as_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^as_",""))

pbl.joint.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("bl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^bl_",""))

phl.joint.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("hl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^hl_",""))

pwh.joint.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("nhwh_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^nhwh_",""))

poth.joint.2000 <- metro.tracts.2000.incl %>%
  select(GEOID,
         starts_with("oth_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^oth_",""))

aspp.joint.2000 <- as.data.frame(lapply(pas.joint.2000[,18:22], hud_group, indata=pas.joint.2000))
blpp.joint.2000 <- as.data.frame(lapply(pbl.joint.2000[,18:22], hud_group, indata=pbl.joint.2000))
hlpp.joint.2000 <- as.data.frame(lapply(phl.joint.2000[,18:22], hud_group, indata=phl.joint.2000))
whpp.joint.2000 <- as.data.frame(lapply(pwh.joint.2000[,18:22], hud_group, indata=pwh.joint.2000))
othpp.joint.2000 <- as.data.frame(lapply(poth.joint.2000[,18:22], hud_group, indata=poth.joint.2000))

## 
## collect the output
##
 
joint.ent.dfs.2000 <- list(aspp.joint.2000,
                           blpp.joint.2000,
                           hlpp.joint.2000,
                           whpp.joint.2000,
                           othpp.joint.2000)

## process ##
joint.ent.dfs.res.2000 <- lapply(joint.ent.dfs.2000, rd.data)

as.fin.joint.2000 <- joint.ent.dfs.res.2000[[1]] %>%
  rename_with(~paste0(., "_as"), linc:hinc)

bl.fin.joint.2000 <- joint.ent.dfs.res.2000[[2]] %>%
  rename_with(~paste0(., "_bl"), linc:hinc)

hl.fin.joint.2000 <- joint.ent.dfs.res.2000[[3]] %>%
  rename_with(~paste0(., "_lx"), linc:hinc)

wh.fin.joint.2000 <- joint.ent.dfs.res.2000[[4]] %>%
  rename_with(~paste0(., "_wh"), linc:hinc)

oth.fin.joint.2000 <- joint.ent.dfs.res.2000[[5]] %>%
  rename_with(~paste0(., "_oth"), linc:hinc)

## combine ##
all.joint.data.2000 <- list(as.fin.joint.2000,
                            bl.fin.joint.2000,
                            hl.fin.joint.2000,
                            wh.fin.joint.2000,
                            oth.fin.joint.2000)

all.joint.2000 <- all.joint.data.2000 %>%
  reduce(full_join, by="GEOID")

## test to see if columns add to 1 ## 
t.joint.2000 <- all.joint.2000 %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF JT SHARES WERE CREATED CORRECTLY")
summary(t.joint.2000$sum)

##
## create joint entropy measure ## 
##

all.joint.final.2000 <- all.joint.2000 %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_joint = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_joint)

paste("JT ENTROPY MEASURE")
summary(all.joint.final.2000$trt_entropy_joint)

##
## add SES and joint entropy measures ##
##

metro.tracts.2000.sesvars.m <- stata.merge(metro.tracts.2000.D,
                                           ses.entropy.final.2000,
                                           "GEOID")

## check merge ##
paste("MERGE BETWEEN 2000 TRACT FILE AND SES ENTROPY FILE")
table(metro.tracts.2000.sesvars.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2000.sesvars <- metro.tracts.2000.sesvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

## second merge ##
metro.tracts.2000.jtvars.m <- stata.merge(metro.tracts.2000.sesvars,
                                          all.joint.final.2000,
                                          "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2000 TRACT FILE AND JT ENTROPY FILE")
table(metro.tracts.2000.jtvars.m$merge.variable, useNA = "ifany")


## keep matches, final cleaning ##
metro.tracts.2000.final <- metro.tracts.2000.jtvars.m %>%
  filter(merge.variable == 3) %>%
  mutate(year = 2000) %>%
  select(GEOID,
         CBSA,
         year,
         pop_total,
         pop_nh_aian, 
         pop_nh_asian,
         pop_nh_black,
         pop_hl,
         pop_nh_multi,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_white,
         per_aian,
         per_asian,
         per_black,
         per_hl,
         per_other,
         per_white,
         per_fb,
         hhs,
         gq_per,
         median_hhld_inc,
         hhld_pov_rt,
         maj_race,
         maj_ses,
         hd_er,
         md_er,
         ld_er,
         tracts_calc,
         D_comp_all,
         D_comp_final,
         D_int,
         msa_D,
         msa_H4_adj,
         trt_entropy_er,
         trt_entropy_ers,
         linc,
         minc,
         hinc,
         trt_entropy_ses,
         trt_entropy_joint,
         pop_total_msa,
         per_aian_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_other_msa,
         per_white_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa) %>%
  mutate(trt_entropy_sess = (trt_entropy_ses - min(trt_entropy_ses))/(max(trt_entropy_ses)-min(trt_entropy_ses)),
         trt_entropy_joints = (trt_entropy_joint - min(trt_entropy_joint))/(max(trt_entropy_joint)-min(trt_entropy_joint)),
         mhi_ratio = median_hhld_inc/median_hhld_inc_msa,
         hd_ses = ifelse(linc < 0.4 & 
                           minc < 0.4 & 
                           hinc < 0.4, 1, 0),
         ld_ses = ifelse(linc > (2/3) | 
                           minc > (2/3) | 
                           hinc > (2/3),1,0),
         md_ses = ifelse((linc >= 0.4 |
                            minc >= 0.4 | 
                            hinc >= 0.4) & 
                           (linc <= (2/3) & 
                            minc <= (2/3) & 
                            hinc <= (2/3)), 1, 0),
         hd_jt = ifelse(hd_er ==1 & hd_ses ==1, 1, 0), 
         ld_jt = ifelse(ld_er==1 | ld_ses == 1, 1, 0), 
         md_jt = ifelse(hd_jt==0 & ld_jt == 0, 1, 0),
         div_er = ifelse(hd_er == 1 | md_er == 1, 1, 0),
         int_er_abs = ifelse(md_er == 1 | hd_er == 1, 1, 0),
         int_er_rel = ifelse((md_er == 1 | hd_er == 1) & D_int == 1, 1, 0),
         int_er_relf = ifelse((md_er == 1 | hd_er == 1) & D_int == 1 & 
                               rowSums(cbind(per_black,per_hl),na.rm=T)>=0.2, 1, 0),
         int_ses = ifelse(hd_ses == 1 | md_ses == 1, 1, 0),
         #hhld_pov_rt < 0.4 & vli <= 0.8, 1, 0
         int_ses_rel_a = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.5 & 
                                  mhi_ratio <= 2 & 
                                  hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_b = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.5 & 
                                  mhi_ratio <= 2 & 
                                  hhld_pov_rt < 0.2, 1, 0),
         int_ses_rel_c = ifelse(int_ses == 1 & 
                                mhi_ratio >= 0.67 & 
                                mhi_ratio <= 1.5 & 
                                hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_d = ifelse(int_ses == 1 & 
                                mhi_ratio >= 0.67 & 
                                mhi_ratio <= 1.5 & 
                                hhld_pov_rt < 0.2, 1, 0),
         int_jt_abs = ifelse(int_er_abs == 1 & int_ses == 1, 1, 0),
         int_jt_rel_a = ifelse(int_er_rel == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_relf_a = ifelse(int_er_relf == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_rel_b = ifelse(int_er_rel == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_relf_b = ifelse(int_er_relf == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_rel_c = ifelse(int_er_rel == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_relf_c = ifelse(int_er_relf == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_rel_d = ifelse(int_er_rel == 1 & int_ses_rel_d == 1, 1, 0),
         int_jt_relf_d = ifelse(int_er_relf == 1 & int_ses_rel_d == 1, 1, 0),
         ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
         median_hhld_inc = median_hhld_inc * 437.6/246.8,
         median_hhld_inc_msa = median_hhld_inc_msa * 437.6/246.8)

## analysis ## 

paste("2000 MUTUAL INFO")
summary(metro.tracts.2000.final$msa_D)
paste("2000 RANK H")
summary(metro.tracts.2000.final$msa_H4_adj)
paste("2000 TRACTS CALC")
summary(metro.tracts.2000.final$tracts_calc)

paste("2000 LOW ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$ld_er, useNA = "ifany"))
paste("2000 MODERATE ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$md_er, useNA = "ifany"))
paste("2000 HIGH ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$hd_er, useNA = "ifany"))
paste("2000 ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$div_er, useNA = "ifany"))

paste("2000 INT ER ABS BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_er_abs, useNA = "ifany"))
paste("2000 INT ER REL BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_er_rel, useNA = "ifany"))
paste("2000 INT ER RELF BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_er_relf, useNA = "ifany"))

paste("2000 HIGH SES DIVERSITY")
prop.table(table(metro.tracts.2000.final$hd_ses, useNA = "ifany"))
paste("2000 MODERATE SES DIVERSITY")
prop.table(table(metro.tracts.2000.final$md_ses, useNA = "ifany"))
paste("2000 LOW SES DIVERSITY")
prop.table(table(metro.tracts.2000.final$ld_ses, useNA = "ifany"))

paste("2000 INT SES BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_ses, useNA = "ifany"))
paste("2000 INT SES REL A BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_ses_rel_a, useNA = "ifany"))
paste("2000 INT SES REL B BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_ses_rel_b, useNA = "ifany"))
paste("2000 INT SES REL C BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_ses_rel_c, useNA = "ifany"))
paste("2000 INT SES REL D BREAKDOWN")
prop.table(table(metro.tracts.2000.final$int_ses_rel_d, useNA = "ifany"))

paste("2000 HIGH JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$hd_jt, useNA = "ifany"))
paste("2000 MODERATE JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$md_jt, useNA = "ifany"))
paste("2000 LOW JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2000.final$ld_jt, useNA = "ifany"))

paste("2000 INT JT ABS")
prop.table(table(metro.tracts.2000.final$int_jt_abs, useNA = "ifany"))
paste("2000 INT JT REL A")
prop.table(table(metro.tracts.2000.final$int_jt_rel_a, useNA = "ifany"))
paste("2000 INT JT RELF A")
prop.table(table(metro.tracts.2000.final$int_jt_relf_a, useNA = "ifany"))
paste("2000 INT JT REL B")
prop.table(table(metro.tracts.2000.final$int_jt_rel_b, useNA = "ifany"))
paste("2000 INT JT RELF B")
prop.table(table(metro.tracts.2000.final$int_jt_relf_b, useNA = "ifany"))
paste("2000 INT JT REL C")
prop.table(table(metro.tracts.2000.final$int_jt_rel_c, useNA = "ifany"))
paste("2000 INT JT RELF C")
prop.table(table(metro.tracts.2000.final$int_jt_relf_c, useNA = "ifany"))
paste("2000 INT JT REL D")
prop.table(table(metro.tracts.2000.final$int_jt_rel_d, useNA = "ifany"))
paste("2000 INT JT RELF D")
prop.table(table(metro.tracts.2000.final$int_jt_relf_d, useNA = "ifany"))

paste("2000 MAJ RACE")
table(metro.tracts.2000.final$maj_race, useNA = "ifany")

paste("2000 MAJ RACE AND INT ER REL CROSSTAB")
table(metro.tracts.2000.final$maj_race,
      metro.tracts.2000.final$int_er_rel,
      useNA = "ifany")

paste("2000 MAJ RACE AND INT SES CROSSTAB")
table(metro.tracts.2000.final$maj_race,
      metro.tracts.2000.final$int_ses,
      useNA = "ifany")

paste("2000 MAJ RACE AND INT JT RELF A CROSSTAB")
table(metro.tracts.2000.final$maj_race,
      metro.tracts.2000.final$int_jt_relf_a,
      useNA = "ifany")

paste("2000 MAJ SES AND INT RELF A CROSSTAB")
table(metro.tracts.2000.final$maj_ses,
      metro.tracts.2000.final$int_jt_relf_a,
      useNA = "ifany")

##########
## 2010 ## 
##########

load("001_2010_counties_cb.Rda")
all.counties.2010.cb$FIPS <- all.counties.2010.cb$GEOID

## bring on CBSA codes ##
all.counties.2010.allvars.m <- stata.merge(all.counties.2010.cb,
                                           county.t.msa, 
                                           "FIPS")

## check merge ##
paste("MERGE BETWEEN 2010 COUNTY FILE AND COUNTY TO MSA FILE")
table(all.counties.2010.allvars.m$merge.variable, useNA = "ifany")

## keep matches ##
all.counties.2010.allvars <- all.counties.2010.allvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## start with unweighted vars ##
agg.msa.2010.nowt <- all.counties.2010.allvars %>%
  select(CBSA,
         pop_total,
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl) %>%
  group_by(CBSA) %>%
  summarize_at(vars(pop_total:pop_hl), sum, na.rm=T)

## now, weighted vars ##
agg.msa.2010.wt <- all.counties.2010.allvars %>%
  select(CBSA,
         pop_total,
         median_hhld_inc) %>%
  group_by(CBSA) %>%
  mutate(pop_total_msa = sum(pop_total, na.rm=T),
         msa_wt = pop_total/pop_total_msa) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., msa_wt, na.rm=T)))

## merge ##
agg.msa.2010.m <- stata.merge(agg.msa.2010.wt,
                              agg.msa.2010.nowt,
                              "CBSA")

## check merge ##
paste("MERGE BETWEEN 2010 MSA FILES A AND B")
table(agg.msa.2010.m$merge.variable, useNA = "ifany")

## final file ##
agg.msa.2010 <- agg.msa.2010.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(per_asian_msa = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                                   pop_total == 0 ~ 0),
         per_black_msa = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                                   pop_total == 0 ~ 0),
         per_hl_msa = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                    pop_total == 0 ~ 0),
         per_white_msa = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                                   pop_total == 0 ~ 0),
         per_aian_msa = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                                  pop_total == 0 ~ 0),
         per_other_msa = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                                   pop_total == 0 ~ 0),
         log_asian_msa = case_when(pop_nh_asian != 0 ~ log(1/per_asian_msa),
                                   pop_nh_asian == 0 ~ 0),
         log_black_msa = case_when(pop_nh_black != 0 ~ log(1/per_black_msa),
                                   pop_nh_black == 0 ~ 0),
         log_hl_msa = case_when(pop_hl != 0 ~ log(1/per_hl_msa),
                                    pop_hl == 0 ~ 0),
         log_white_msa = case_when(pop_nh_white != 0 ~ log(1/per_white_msa),
                                   pop_nh_white == 0 ~ 0),
         log_aian_msa = case_when(pop_nh_aian != 0 ~ log(1/per_aian_msa),
                                  pop_nh_aian == 0 ~ 0),
         log_other_msa = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other_msa),
                                   rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
         msa_entropy_er = per_asian_msa*log_asian_msa + 
                          per_black_msa*log_black_msa + 
                          per_hl_msa*log_hl_msa + 
                          per_white_msa*log_white_msa +
                          per_aian_msa*log_aian_msa +
                          per_other_msa*log_other_msa,
         msa_entropy_ers = (msa_entropy_er - min(msa_entropy_er))/(max(msa_entropy_er)-min(msa_entropy_er))) %>%
  rename(pop_total_msa = pop_total,
         median_hhld_inc_msa = median_hhld_inc) %>%
  select(CBSA,
         pop_total_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_white_msa,
         per_aian_msa,
         per_other_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa)

##
## now, merge this info back onto tracts ## 
##

metro.tracts.2010.fm <- af.wide %>%
  select(GEOID,
         CBSA,
         ends_with("_2010")) %>%
  rename_at(.vars = vars(ends_with("_2010")),
            .funs = funs(sub("[_]2010$", "", .)))

metro.tracts.2010.wmsa.m <- stata.merge(metro.tracts.2010.fm,
                                        agg.msa.2010,
                                        "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2010 TRACT FILE AND 2010 MSA FILE")
table(metro.tracts.2010.wmsa.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2010.wmsa <- metro.tracts.2010.wmsa.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## bring on rank H ##
##

rankH.2010 <- read_dta("acs_rank_H_2010.dta")

rankH.2010.pr <- rankH.2010 %>%
  group_by(metro) %>%
  summarize(h4_adj = mean(h4_adj, na.rm=T)) %>%
  mutate(CBSA = as.character(metro)) %>%
  select(CBSA,
         h4_adj) %>%
  rename(msa_H4_adj = h4_adj) 

## merge ## 

metro.tracts.2010.wh4.m <- stata.merge(metro.tracts.2010.wmsa,
                                       rankH.2010.pr,
                                       "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2010 TRACT FILE and 2010 RANK H FILE")
table(metro.tracts.2010.wh4.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2010.wh4 <- metro.tracts.2010.wh4.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

##
## calculate msa segregation (mutual information)
##

metro.tracts.2010.D <- metro.tracts.2010.wh4 %>%
  mutate(D_pre_comp_aian = case_when(per_aian !=0 & per_aian_msa != 0 ~ log(per_aian/per_aian_msa),
                                     per_aian == 0 | per_aian_msa == 0 ~ 0),
         D_pre_comp_asian = case_when(per_asian !=0 &  per_asian_msa != 0 ~ log(per_asian/per_asian_msa),
                                      per_asian == 0 | per_asian_msa == 0 ~ 0),
         D_pre_comp_black = case_when(per_black !=0 & per_black_msa != 0 ~ log(per_black/per_black_msa),
                                      per_black == 0 | per_black_msa == 0 ~ 0),
         D_pre_comp_hl = case_when(per_hl !=0 &  per_hl_msa != 0 ~ log(per_hl/per_hl_msa),
                                       per_hl == 0 | per_hl_msa == 0 ~ 0),
         D_pre_comp_other = case_when(per_other !=0 & per_other_msa != 0 ~ log(per_other/per_other_msa),
                                      per_other == 0 | per_other_msa == 0 ~ 0),
         D_pre_comp_white = case_when(per_white !=0 &  per_white_msa != 0 ~ log(per_white/per_white_msa),
                                      per_white == 0 | per_white_msa == 0 ~ 0),
         D_comp_aian = per_aian * D_pre_comp_aian,
         D_comp_asian = per_asian * D_pre_comp_asian,
         D_comp_black = per_black * D_pre_comp_black,
         D_comp_hl = per_hl * D_pre_comp_hl,
         D_comp_other = per_other * D_pre_comp_other,
         D_comp_white = per_white * D_pre_comp_white,
         D_comp_all = rowSums(cbind(D_comp_aian,D_comp_asian,D_comp_black,D_comp_hl,D_comp_other,D_comp_white),na.rm=T),
         D_comp_final = (pop_total/pop_total_msa) * D_comp_all) %>%
  group_by(CBSA) %>%
  mutate(tracts_calc = n(),
         msa_D = sum(D_comp_final,na.rm=T)) %>%
  ungroup()

metro.tracts.2010.D$D_int <- ifelse(metro.tracts.2010.D$D_comp_all < median(metro.tracts.2010.D$D_comp_all,na.rm=T) + 
                                                                     sd(metro.tracts.2010.D$D_comp_all,na.rm=T), 1, 0) 

paste("2010 MUTUAL INFO")
table(metro.tracts.2010.D$D_int, useNA = "ifany")

##
## calculate entropy by SES ##
##

## prep data ##
metro.tracts.2010.incl <- metro.tracts.2010.D %>%
  mutate(vlil = 0.5*median_hhld_inc_msa,
         lil = 0.8*median_hhld_inc_msa,
         mil = median_hhld_inc_msa,
         hmil = 1.2*median_hhld_inc_msa,
         hil = 1.5*median_hhld_inc_msa)

## keep only necessary vars ##
ses.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil)

## check for missing variables ##
paste("SUMMARY OF 2010 SES FILE")
summary(ses.2010)
paste("MISSINGS IN 2010 SES FILE")
sapply(ses.2010, function(x) sum(is.na(x)))

## input data into function that calculates distribution by HUD income limits ##
pp.ses.2010 <- as.data.frame(lapply(ses.2010[,18:22], hud_group, indata=ses.2010))

## input resulting data into cleaning function ##
pp.ses.2010.fin <- rd.data(pp.ses.2010)

## test ## 
t.ses.2010 <- pp.ses.2010.fin %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF 2010 SES %s WERE CREATED CORRECTLY")
summary(t.ses.2010$sum)

## calculate SES entropy ##
ses.entropy.2010 <- pp.ses.2010.fin %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_ses = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_ses)

paste("2010 SES ENTROPY")
summary(ses.entropy.2010$trt_entropy_ses)

## merge SES entropy onto working data ##
ses.entropy.final.2010.m <- stata.merge(pp.ses.2010.fin,
                                        ses.entropy.2010,
                                        "GEOID")

## check merge ## 
paste("MERGE BETWEEN SES FILE AND SES ENTROPY OUTPUT")
table(ses.entropy.final.2010.m$merge.variable, useNA = "ifany")

## keep matches ##
ses.entropy.final.2010 <- ses.entropy.final.2010.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## determine majority/plurality SES group ##
##

ses.entropy.final.2010$maj_ses <- c(colnames(ses.entropy.final.2010[,c("linc",
                                                                       "minc",
                                                                       "hinc")])[apply(ses.entropy.final.2010[,c("linc",
                                                                                                                 "minc",
                                                                                                                 "hinc")],
                                                                                      1,which.max)])

##
## now, create entropy by SES and joint entropy ##
##

pas.joint.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("as_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^as_",""))

pbl.joint.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("bl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^bl_",""))

phl.joint.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("hl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^hl_",""))

pwh.joint.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("nhwh_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^nhwh_",""))

poth.joint.2010 <- metro.tracts.2010.incl %>%
  select(GEOID,
         starts_with("oth_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^oth_",""))

aspp.joint.2010 <- as.data.frame(lapply(pas.joint.2010[,18:22], hud_group, indata=pas.joint.2010))
blpp.joint.2010 <- as.data.frame(lapply(pbl.joint.2010[,18:22], hud_group, indata=pbl.joint.2010))
hlpp.joint.2010 <- as.data.frame(lapply(phl.joint.2010[,18:22], hud_group, indata=phl.joint.2010))
whpp.joint.2010 <- as.data.frame(lapply(pwh.joint.2010[,18:22], hud_group, indata=pwh.joint.2010))
othpp.joint.2010 <- as.data.frame(lapply(poth.joint.2010[,18:22], hud_group, indata=poth.joint.2010))

## 
## collect the output
##

joint.ent.dfs.2010 <- list(aspp.joint.2010,
                           blpp.joint.2010,
                           hlpp.joint.2010,
                           whpp.joint.2010,
                           othpp.joint.2010)

## process ##
joint.ent.dfs.res.2010 <- lapply(joint.ent.dfs.2010, rd.data)

as.fin.joint.2010 <- joint.ent.dfs.res.2010[[1]] %>%
  rename_with(~paste0(., "_as"), linc:hinc)

bl.fin.joint.2010 <- joint.ent.dfs.res.2010[[2]] %>%
  rename_with(~paste0(., "_bl"), linc:hinc)

hl.fin.joint.2010 <- joint.ent.dfs.res.2010[[3]] %>%
  rename_with(~paste0(., "_lx"), linc:hinc)

wh.fin.joint.2010 <- joint.ent.dfs.res.2010[[4]] %>%
  rename_with(~paste0(., "_wh"), linc:hinc)

oth.fin.joint.2010 <- joint.ent.dfs.res.2010[[5]] %>%
  rename_with(~paste0(., "_oth"), linc:hinc)

## combine ## 
all.joint.data.2010 <- list(as.fin.joint.2010,
                            bl.fin.joint.2010,
                            hl.fin.joint.2010,
                            wh.fin.joint.2010,
                            oth.fin.joint.2010)

all.joint.2010 <- all.joint.data.2010 %>%
  reduce(full_join, by="GEOID")

## test to see if columns add to 1 ## 
t.joint.2010 <- all.joint.2010 %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF JT SHARES WERE CREATED CORRECTLY")
summary(t.joint.2010$sum)

##
## create joint entropy measure ## 
##

all.joint.final.2010 <- all.joint.2010 %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_joint = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_joint)

paste("2010 JT ENTROPY")
summary(all.joint.final.2010$trt_entropy_joint)

##
## add SES and joint entropy measures ##
##

metro.tracts.2010.sesvars.m <- stata.merge(metro.tracts.2010.D,
                                           ses.entropy.final.2010,
                                           "GEOID")

## check merge ##
paste("MERGE BETWEEN 2010 TRACT FILE AND SES ENTROPY FILE")
table(metro.tracts.2010.sesvars.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2010.sesvars <- metro.tracts.2010.sesvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

## second merge ##
metro.tracts.2010.jtvars.m <- stata.merge(metro.tracts.2010.sesvars,
                                          all.joint.final.2010,
                                          "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2010 TRACT FILE AND JT ENTROPY FILE")
table(metro.tracts.2010.jtvars.m$merge.variable, useNA = "ifany")


## keep matches, final cleaning ##
metro.tracts.2010.final <- metro.tracts.2010.jtvars.m %>%
  filter(merge.variable == 3) %>%
  mutate(year = 2010) %>%
  select(GEOID,
         CBSA,
         year,
         pop_total,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_black,
         pop_hl,
         pop_nh_multi,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_white,
         per_aian,
         per_asian,
         per_black,
         per_hl,
         per_other,
         per_white,
         per_fb,
         hhs,
         gq_per,
         median_hhld_inc,
         hhld_pov_rt,
         maj_race,
         maj_ses,
         hd_er,
         md_er,
         ld_er,
         tracts_calc,
         D_comp_all,
         D_comp_final,
         D_int,
         msa_D,
         msa_H4_adj,
         trt_entropy_er,
         trt_entropy_ers,
         linc,
         minc,
         hinc,
         trt_entropy_ses,
         trt_entropy_joint,
         pop_total_msa,
         per_aian_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_other_msa,
         per_white_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa) %>%
  mutate(trt_entropy_sess = (trt_entropy_ses - min(trt_entropy_ses))/(max(trt_entropy_ses)-min(trt_entropy_ses)),
         trt_entropy_joints = (trt_entropy_joint - min(trt_entropy_joint))/(max(trt_entropy_joint)-min(trt_entropy_joint)),
         mhi_ratio = median_hhld_inc/median_hhld_inc_msa,
         hd_ses = ifelse(linc < 0.4 & 
                           minc < 0.4 & 
                           hinc < 0.4, 1, 0),
         ld_ses = ifelse(linc > (2/3) | 
                           minc > (2/3) | 
                           hinc > (2/3),1,0),
         md_ses = ifelse((linc >= 0.4 |
                            minc >= 0.4 | 
                            hinc >= 0.4) & 
                           (linc <= (2/3) & 
                              minc <= (2/3) & 
                              hinc <= (2/3)), 1, 0),
         hd_jt = ifelse(hd_er ==1 & hd_ses ==1, 1, 0), 
         ld_jt = ifelse(ld_er==1 | ld_ses == 1, 1, 0), 
         md_jt = ifelse(hd_jt==0 & ld_jt == 0, 1, 0),
         div_er = ifelse(hd_er == 1 | md_er == 1, 1, 0),
         int_er_abs = ifelse(md_er == 1 | hd_er == 1, 1, 0),
         int_er_rel = ifelse((md_er == 1 | hd_er == 1) & D_int == 1, 1, 0),
         int_er_relf = ifelse((md_er == 1 | hd_er == 1) & D_int == 1 & 
                                rowSums(cbind(per_black,per_hl),na.rm=T)>=0.2, 1, 0),
         int_ses = ifelse(hd_ses == 1 | md_ses == 1, 1, 0),
         #hhld_pov_rt < 0.4 & vli <= 0.8, 1, 0
         int_ses_rel_a = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.5 & 
                                  mhi_ratio <= 2 & 
                                  hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_b = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.5 & 
                                  mhi_ratio <= 2 & 
                                  hhld_pov_rt < 0.2, 1, 0),
         int_ses_rel_c = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.67 & 
                                  mhi_ratio <= 1.5 & 
                                  hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_d = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.67 & 
                                  mhi_ratio <= 1.5 & 
                                  hhld_pov_rt < 0.2, 1, 0),
         int_jt_abs = ifelse(int_er_abs == 1 & int_ses == 1, 1, 0),
         int_jt_rel_a = ifelse(int_er_rel == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_relf_a = ifelse(int_er_relf == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_rel_b = ifelse(int_er_rel == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_relf_b = ifelse(int_er_relf == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_rel_c = ifelse(int_er_rel == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_relf_c = ifelse(int_er_relf == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_rel_d = ifelse(int_er_rel == 1 & int_ses_rel_d == 1, 1, 0),
         int_jt_relf_d = ifelse(int_er_relf == 1 & int_ses_rel_d == 1, 1, 0),
         ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
         median_hhld_inc = median_hhld_inc * 437.6/337,
         median_hhld_inc_msa = median_hhld_inc_msa * 437.6/337)

## analysis ## 

paste("2010 MUTUAL INFO")
summary(metro.tracts.2010.final$msa_D)
paste("2010 RANK H")
summary(metro.tracts.2010.final$msa_H4_adj)
paste("2010 TRACTS CALC")
summary(metro.tracts.2010.final$tracts_calc)

paste("2010 LOW ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$ld_er, useNA = "ifany"))
paste("2010 MODERATE ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$md_er, useNA = "ifany"))
paste("2010 HIGH ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$hd_er, useNA = "ifany"))
paste("2010 ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$div_er, useNA = "ifany"))

paste("2010 INT ER ABS BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_er_abs, useNA = "ifany"))
paste("2010 INT ER REL BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_er_rel, useNA = "ifany"))
paste("2010 INT ER RELF BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_er_relf, useNA = "ifany"))

paste("2010 HIGH SES DIVERSITY")
prop.table(table(metro.tracts.2010.final$hd_ses, useNA = "ifany"))
paste("2010 MODERATE SES DIVERSITY")
prop.table(table(metro.tracts.2010.final$md_ses, useNA = "ifany"))
paste("2010 LOW SES DIVERSITY")
prop.table(table(metro.tracts.2010.final$ld_ses, useNA = "ifany"))

paste("2010 INT SES BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_ses, useNA = "ifany"))
paste("2010 INT SES REL A BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_ses_rel_a, useNA = "ifany"))
paste("2010 INT SES REL B BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_ses_rel_b, useNA = "ifany"))
paste("2010 INT SES REL C BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_ses_rel_c, useNA = "ifany"))
paste("2010 INT SES REL D BREAKDOWN")
prop.table(table(metro.tracts.2010.final$int_ses_rel_d, useNA = "ifany"))

paste("2010 HIGH JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$hd_jt, useNA = "ifany"))
paste("2010 MODERATE JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$md_jt, useNA = "ifany"))
paste("2010 LOW JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2010.final$ld_jt, useNA = "ifany"))

paste("2010 INT JT ABS")
prop.table(table(metro.tracts.2010.final$int_jt_abs, useNA = "ifany"))
paste("2010 INT JT REL A")
prop.table(table(metro.tracts.2010.final$int_jt_rel_a, useNA = "ifany"))
paste("2010 INT JT RELF A")
prop.table(table(metro.tracts.2010.final$int_jt_relf_a, useNA = "ifany"))
paste("2010 INT JT REL B")
prop.table(table(metro.tracts.2010.final$int_jt_rel_b, useNA = "ifany"))
paste("2010 INT JT RELF B")
prop.table(table(metro.tracts.2010.final$int_jt_relf_b, useNA = "ifany"))
paste("2010 INT JT REL C")
prop.table(table(metro.tracts.2010.final$int_jt_rel_c, useNA = "ifany"))
paste("2010 INT JT RELF C")
prop.table(table(metro.tracts.2010.final$int_jt_relf_c, useNA = "ifany"))
paste("2010 INT JT REL D")
prop.table(table(metro.tracts.2010.final$int_jt_rel_d, useNA = "ifany"))
paste("2010 INT JT RELF D")
prop.table(table(metro.tracts.2010.final$int_jt_relf_d, useNA = "ifany"))

paste("2010 MAJ RACE")
table(metro.tracts.2010.final$maj_race, useNA = "ifany")

paste("2010 MAJ RACE AND INT ER REL CROSSTAB")
table(metro.tracts.2010.final$maj_race,
      metro.tracts.2010.final$int_er_rel,
      useNA = "ifany")

paste("2010 MAJ RACE AND INT SES CROSSTAB")
table(metro.tracts.2010.final$maj_race,
      metro.tracts.2010.final$int_ses,
      useNA = "ifany")

paste("2010 MAJ RACE AND INT JT RELF A CROSSTAB")
table(metro.tracts.2010.final$maj_race,
      metro.tracts.2010.final$int_jt_relf_a,
      useNA = "ifany")

paste("2010 MAJ SES AND INT RELF A CROSSTAB")
table(metro.tracts.2010.final$maj_ses,
      metro.tracts.2010.final$int_jt_relf_a,
      useNA = "ifany")


##########
## 2020 ## 
##########

load("001_2020_counties_cb.Rda")
all.counties.2020.cb$FIPS <- all.counties.2020.cb$GEOID

## bring on CBSA codes ##
all.counties.2020.allvars.m <- stata.merge(all.counties.2020.cb,
                                           county.t.msa, 
                                           "FIPS")

## diagnose merge ##
paste("MERGE BETWEEN 2020 COUNTIES AND COUNTY TO MSA FILE")
table(all.counties.2020.allvars.m$merge.variable, useNA = "ifany")

## check: are the non-matching obs all in Puerto Rico? ##
nm2.metro.2020 <- all.counties.2020.allvars.m %>%
  filter(merge.variable == 2)

## the -1 is 09013, the combined CT county (Tolland) ##
paste("ARE ALL THE NON-MATCHING OBS IN PUERTO RICO?")
sum(substr(nm2.metro.2020$FIPS,1,2) == "72") == nrow(nm2.metro.2020) - 1

## keep matches ##
all.counties.2020.allvars <- all.counties.2020.allvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## start with unweighted vars ##
agg.msa.2020.nowt <- all.counties.2020.allvars %>%
  select(CBSA,
         pop_total,
         pop_nh,
         pop_nh_white,
         pop_nh_black,
         pop_nh_aian,
         pop_nh_asian,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_multi,
         pop_hl) %>%
  group_by(CBSA) %>%
  summarize_at(vars(pop_total:pop_hl), sum, na.rm=T)

## now, weighted vars ##
agg.msa.2020.wt <- all.counties.2020.allvars %>%
  select(CBSA,
         pop_total,
         median_hhld_inc) %>%
  group_by(CBSA) %>%
  mutate(pop_total_msa = sum(pop_total, na.rm=T),
         msa_wt = pop_total/pop_total_msa) %>%
  summarize_at(vars(median_hhld_inc),
               funs(weighted.mean(., msa_wt, na.rm=T)))

## merge ##
agg.msa.2020.m <- stata.merge(agg.msa.2020.wt,
                              agg.msa.2020.nowt,
                              "CBSA")

## check merge ##
paste("MERGE BETWEEN 2020 MSA FILES A AND B")
table(agg.msa.2020.m$merge.variable, useNA = "ifany")

## final file ##
agg.msa.2020 <- agg.msa.2020.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(per_asian_msa = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                                   pop_total == 0 ~ 0),
         per_black_msa = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                                   pop_total == 0 ~ 0),
         per_hl_msa = case_when(pop_total != 0 ~ pop_hl/pop_total,
                                pop_total == 0 ~ 0),
         per_white_msa = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                                   pop_total == 0 ~ 0),
         per_aian_msa = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                                  pop_total == 0 ~ 0),
         per_other_msa = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                                   pop_total == 0 ~ 0),
         log_asian_msa = case_when(pop_nh_asian != 0 ~ log(1/per_asian_msa),
                                   pop_nh_asian == 0 ~ 0),
         log_black_msa = case_when(pop_nh_black != 0 ~ log(1/per_black_msa),
                                   pop_nh_black == 0 ~ 0),
         log_hl_msa = case_when(pop_hl != 0 ~ log(1/per_hl_msa),
                                pop_hl == 0 ~ 0),
         log_white_msa = case_when(pop_nh_white != 0 ~ log(1/per_white_msa),
                                   pop_nh_white == 0 ~ 0),
         log_aian_msa = case_when(pop_nh_aian != 0 ~ log(1/per_aian_msa),
                                  pop_nh_aian == 0 ~ 0),
         log_other_msa = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other_msa),
                                   rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
         msa_entropy_er = per_asian_msa*log_asian_msa + 
                          per_black_msa*log_black_msa + 
                          per_hl_msa*log_hl_msa + 
                          per_white_msa*log_white_msa +
                          per_aian_msa*log_aian_msa +
                          per_other_msa*log_other_msa,
         msa_entropy_ers = (msa_entropy_er - min(msa_entropy_er))/(max(msa_entropy_er)-min(msa_entropy_er))) %>%
  rename(pop_total_msa = pop_total,
         median_hhld_inc_msa = median_hhld_inc) %>%
  select(CBSA,
         pop_total_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_white_msa,
         per_aian_msa,
         per_other_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa)

## save file for use in 009_sc later on ## 
save(agg.msa.2020,
     file = "002_agg_msa_2020.Rda")

##
## now, merge this info back onto tracts ## 
##

metro.tracts.2020.fm <- af.wide %>%
  select(GEOID,
         CBSA,
         ends_with("_2020")) %>%
  rename_at(.vars = vars(ends_with("_2020")),
            .funs = funs(sub("[_]2020$", "", .)))

metro.tracts.2020.wmsa.m <- stata.merge(metro.tracts.2020.fm,
                                        agg.msa.2020,
                                        "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2020 TRACT FILE AND 2020 MSA FILE")
table(metro.tracts.2020.wmsa.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2020.wmsa <- metro.tracts.2020.wmsa.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## bring on rank H ##
##

rankH.2020 <- read_dta("acs_rank_H_2022.dta")

rankH.2020.pr <- rankH.2020 %>%
  group_by(metro) %>%
  summarize(h4_adj = mean(h4_adj, na.rm=T)) %>%
  mutate(CBSA = as.character(metro)) %>%
  select(CBSA,
         h4_adj) %>%
  rename(msa_H4_adj = h4_adj) 

## merge ## 
metro.tracts.2020.wh4.m <- stata.merge(metro.tracts.2020.wmsa,
                                       rankH.2020.pr,
                                       "CBSA")

## check merge ## 
paste("MERGE BETWEEN 2020 TRACT FILE AND 2020 RANK H FILE")
table(metro.tracts.2020.wh4.m$merge.variable, useNA = "ifany")

## keep 1,3 ##
metro.tracts.2020.wh4 <- metro.tracts.2020.wh4.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  select(-merge.variable) 

##
## calculate msa segregation (mutual information)
##

metro.tracts.2020.D <- metro.tracts.2020.wh4 %>%
  mutate(D_pre_comp_aian = case_when(per_aian !=0 & per_aian_msa != 0 ~ log(per_aian/per_aian_msa),
                                     per_aian == 0 | per_aian_msa == 0 ~ 0),
         D_pre_comp_asian = case_when(per_asian !=0 &  per_asian_msa != 0 ~ log(per_asian/per_asian_msa),
                                      per_asian == 0 | per_asian_msa == 0 ~ 0),
         D_pre_comp_black = case_when(per_black !=0 & per_black_msa != 0 ~ log(per_black/per_black_msa),
                                      per_black == 0 | per_black_msa == 0 ~ 0),
         D_pre_comp_hl = case_when(per_hl !=0 &  per_hl_msa != 0 ~ log(per_hl/per_hl_msa),
                                       per_hl == 0 | per_hl_msa == 0 ~ 0),
         D_pre_comp_other = case_when(per_other !=0 & per_other_msa != 0 ~ log(per_other/per_other_msa),
                                      per_other == 0 | per_other_msa == 0 ~ 0),
         D_pre_comp_white = case_when(per_white !=0 &  per_white_msa != 0 ~ log(per_white/per_white_msa),
                                      per_white == 0 | per_white_msa == 0 ~ 0),
         D_comp_aian = per_aian * D_pre_comp_aian,
         D_comp_asian = per_asian * D_pre_comp_asian,
         D_comp_black = per_black * D_pre_comp_black,
         D_comp_hl = per_hl * D_pre_comp_hl,
         D_comp_other = per_other * D_pre_comp_other,
         D_comp_white = per_white * D_pre_comp_white,
         D_comp_all = rowSums(cbind(D_comp_aian,D_comp_asian,D_comp_black,D_comp_hl,D_comp_other,D_comp_white),na.rm=T),
         D_comp_final = (pop_total/pop_total_msa) * D_comp_all) %>%
  group_by(CBSA) %>%
  mutate(tracts_calc = n(),
         msa_D = sum(D_comp_final,na.rm=T)) %>%
  ungroup()

metro.tracts.2020.D$D_int <- ifelse(metro.tracts.2020.D$D_comp_all < median(metro.tracts.2020.D$D_comp_all,na.rm=T) + 
                                                                     sd(metro.tracts.2020.D$D_comp_all,na.rm=T), 1, 0) 

paste("2020 MUTUAL INFO")
table(metro.tracts.2020.D$D_int, useNA = "ifany")


## calculate entropy by SES ##

## prep data ##
metro.tracts.2020.incl <- metro.tracts.2020.D %>%
  mutate(vlil = 0.5*median_hhld_inc_msa,
         lil = 0.8*median_hhld_inc_msa,
         mil = median_hhld_inc_msa,
         hmil = 1.2*median_hhld_inc_msa,
         hil = 1.5*median_hhld_inc_msa)

## keep only necessary vars ##
ses.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil)

## check for missing variables ##
paste("SUMMARY OF 2020 SES FILE")
summary(ses.2020)
paste("MISSINGS IN 2020 SES FILE")
sapply(ses.2020, function(x) sum(is.na(x)))

## input data into function that calculates distribution by HUD income limits ##
pp.ses.2020 <- as.data.frame(lapply(ses.2020[,18:22], hud_group, indata=ses.2020))

## input resulting data into cleaning function ##
pp.ses.2020.fin <- rd.data(pp.ses.2020)

## test ## 
t.ses.2020 <- pp.ses.2020.fin %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF SES SHARES WERE CREATED CORRECTLY")
summary(t.ses.2020$sum)

## calculate SES entropy ##
ses.entropy.2020 <- pp.ses.2020.fin %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_ses = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_ses)

paste("2020 SES ENTROPY")
summary(ses.entropy.2020$trt_entropy_ses)

## merge SES entropy onto working data ##
ses.entropy.final.2020.m <- stata.merge(pp.ses.2020.fin,
                                        ses.entropy.2020,
                                        "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2020 SES FILE AND 2020 SES ENTROPY OUTPUT")
table(ses.entropy.final.2020.m$merge.variable, useNA = "ifany")

## keep matches ##
ses.entropy.final.2020 <- ses.entropy.final.2020.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

##
## determine majority/plurality SES group ##
##

ses.entropy.final.2020$maj_ses <- c(colnames(ses.entropy.final.2020[,c("linc",
                                                                       "minc",
                                                                       "hinc")])[apply(ses.entropy.final.2020[,c("linc",
                                                                                                                 "minc",
                                                                                                                 "hinc")],
                                                                                      1,which.max)])


##
## now, create entropy by SES and joint entropy ##
##

pas.joint.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("as_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^as_",""))

pbl.joint.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("bl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^bl_",""))

phl.joint.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("hl_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^hl_",""))

pwh.joint.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("nhwh_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^nhwh_",""))

poth.joint.2020 <- metro.tracts.2020.incl %>%
  select(GEOID,
         starts_with("oth_phi"),
         vlil,
         lil,
         mil,
         hmil,
         hil) %>%
  rename_all(~str_replace(.,"^oth_",""))

aspp.joint.2020 <- as.data.frame(lapply(pas.joint.2020[,18:22], hud_group, indata=pas.joint.2020))
blpp.joint.2020 <- as.data.frame(lapply(pbl.joint.2020[,18:22], hud_group, indata=pbl.joint.2020))
hlpp.joint.2020 <- as.data.frame(lapply(phl.joint.2020[,18:22], hud_group, indata=phl.joint.2020))
whpp.joint.2020 <- as.data.frame(lapply(pwh.joint.2020[,18:22], hud_group, indata=pwh.joint.2020))
othpp.joint.2020 <- as.data.frame(lapply(poth.joint.2020[,18:22], hud_group, indata=poth.joint.2020))

##
## collect the output 
##

joint.ent.dfs.2020 <- list(aspp.joint.2020,
                           blpp.joint.2020,
                           hlpp.joint.2020,
                           whpp.joint.2020,
                           othpp.joint.2020)


joint.ent.dfs.res.2020 <- lapply(joint.ent.dfs.2020, rd.data)

as.fin.joint.2020 <- joint.ent.dfs.res.2020[[1]] %>%
  rename_with(~paste0(., "_as"), linc:hinc)

bl.fin.joint.2020 <- joint.ent.dfs.res.2020[[2]] %>%
  rename_with(~paste0(., "_bl"), linc:hinc)

hl.fin.joint.2020 <- joint.ent.dfs.res.2020[[3]] %>%
  rename_with(~paste0(., "_lx"), linc:hinc)

wh.fin.joint.2020 <- joint.ent.dfs.res.2020[[4]] %>%
  rename_with(~paste0(., "_wh"), linc:hinc)

oth.fin.joint.2020 <- joint.ent.dfs.res.2020[[5]] %>%
  rename_with(~paste0(., "_oth"), linc:hinc)

## combine ## 
all.joint.data.2020 <- list(as.fin.joint.2020,
                            bl.fin.joint.2020,
                            hl.fin.joint.2020,
                            wh.fin.joint.2020,
                            oth.fin.joint.2020)

all.joint.2020 <- all.joint.data.2020 %>%
  reduce(full_join, by="GEOID")

## test to see if columns add to 1 ## 
t.joint.2020 <- all.joint.2020 %>%
  mutate(sum = rowSums(across(where(is.numeric))))

paste("CHECK TO SEE IF JT SHARES WERE CREATED CORRECTLY")
summary(t.joint.2020$sum)

##
## create joint entropy measure 
##

all.joint.final.2020 <- all.joint.2020 %>%
  mutate_if(is.numeric, funs(ifelse(.>0, .*log(1/.),0))) %>%
  mutate(trt_entropy_joint = rowSums(across(where(is.numeric)))) %>%
  select(GEOID,
         trt_entropy_joint)

paste("2020 JT ENTROPY")
summary(all.joint.final.2020$trt_entropy_joint)

##
## add SES and joint entropy measures ##
##

metro.tracts.2020.sesvars.m <- stata.merge(metro.tracts.2020.D,
                                           ses.entropy.final.2020,
                                           "GEOID")

## check merge ##
paste("MERGE BETWEEN 2020 TRACT FILE AND SES ENTROPY")
table(metro.tracts.2020.sesvars.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2020.sesvars <- metro.tracts.2020.sesvars.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) 

## second merge ##
metro.tracts.2020.jtvars.m <- stata.merge(metro.tracts.2020.sesvars,
                                          all.joint.final.2020,
                                          "GEOID")

## check merge ## 
paste("MERGE BETWEEN 2020 TRACT FILE AND JT ENTROPY")
table(metro.tracts.2020.jtvars.m$merge.variable, useNA = "ifany")

## keep matches, final cleaning ##
metro.tracts.2020.final <- metro.tracts.2020.jtvars.m %>%
  filter(merge.variable == 3) %>%
  mutate(year = 2020) %>%
  select(GEOID,
         CBSA,
         year,
         pop_total,
         pop_nh_aian, 
         pop_nh_asian,
         pop_nh_black,
         pop_hl,
         pop_nh_multi,
         pop_nh_nhpi,
         pop_nh_other,
         pop_nh_white,
         per_aian,
         per_asian,
         per_black,
         per_hl,
         per_other,
         per_white,
         per_fb,
         hhs,
         gq_per,
         median_hhld_inc,
         hhld_pov_rt,
         maj_race,
         maj_ses,
         hd_er,
         md_er,
         ld_er,
         tracts_calc,
         D_comp_all,
         D_comp_final,
         D_int,
         msa_D,
         msa_H4_adj,
         trt_entropy_er,
         trt_entropy_ers,
         linc,
         minc,
         hinc,
         trt_entropy_ses,
         trt_entropy_joint,
         pop_total_msa,
         per_aian_msa,
         per_asian_msa,
         per_black_msa,
         per_hl_msa,
         per_other_msa,
         per_white_msa,
         msa_entropy_er,
         msa_entropy_ers,
         median_hhld_inc_msa) %>%
  mutate(trt_entropy_sess = (trt_entropy_ses - min(trt_entropy_ses))/(max(trt_entropy_ses)-min(trt_entropy_ses)),
         trt_entropy_joints = (trt_entropy_joint - min(trt_entropy_joint))/(max(trt_entropy_joint)-min(trt_entropy_joint)),
         mhi_ratio = median_hhld_inc/median_hhld_inc_msa,
         hd_ses = ifelse(linc < 0.4 & 
                           minc < 0.4 & 
                           hinc < 0.4, 1, 0),
         ld_ses = ifelse(linc > (2/3) | 
                           minc > (2/3) | 
                           hinc > (2/3),1,0),
         md_ses = ifelse((linc >= 0.4 |
                            minc >= 0.4 | 
                            hinc >= 0.4) & 
                           (linc <= (2/3) & 
                              minc <= (2/3) & 
                              hinc <= (2/3)), 1, 0),
         hd_jt = ifelse(hd_er ==1 & hd_ses ==1, 1, 0), 
         ld_jt = ifelse(ld_er==1 | ld_ses == 1, 1, 0), 
         md_jt = ifelse(hd_jt==0 & ld_jt == 0, 1, 0),
         div_er = ifelse(hd_er == 1 | md_er == 1, 1, 0),
         int_er_abs = ifelse(md_er == 1 | hd_er == 1, 1, 0),
         int_er_rel = ifelse((md_er == 1 | hd_er == 1) & D_int == 1, 1, 0),
         int_er_relf = ifelse((md_er == 1 | hd_er == 1) & D_int == 1 & 
                                rowSums(cbind(per_black,per_hl),na.rm=T)>=0.2, 1, 0),
         int_ses = ifelse(hd_ses == 1 | md_ses == 1, 1, 0),
         int_ses_rel_a = ifelse(int_ses == 1 & 
                                mhi_ratio >= 0.5 & 
                                mhi_ratio <= 2 & 
                                hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_b = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.5 & 
                                  mhi_ratio <= 2 & 
                                  hhld_pov_rt < 0.2, 1, 0),
         int_ses_rel_c = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.67 & 
                                  mhi_ratio <= 1.5 & 
                                  hhld_pov_rt < 0.4, 1, 0),
         int_ses_rel_d = ifelse(int_ses == 1 & 
                                  mhi_ratio >= 0.67 & 
                                  mhi_ratio <= 1.5 & 
                                  hhld_pov_rt < 0.2, 1, 0),
         int_jt_abs = ifelse(int_er_abs == 1 & int_ses == 1, 1, 0),
         int_jt_rel_a = ifelse(int_er_rel == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_relf_a = ifelse(int_er_relf == 1 & int_ses_rel_a == 1, 1, 0),
         int_jt_rel_b = ifelse(int_er_rel == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_relf_b = ifelse(int_er_relf == 1 & int_ses_rel_b == 1, 1, 0),
         int_jt_rel_c = ifelse(int_er_rel == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_relf_c = ifelse(int_er_relf == 1 & int_ses_rel_c == 1, 1, 0),
         int_jt_rel_d = ifelse(int_er_rel == 1 & int_ses_rel_d == 1, 1, 0),
         int_jt_relf_d = ifelse(int_er_relf == 1 & int_ses_rel_d == 1, 1, 0))

#save(metro.tracts.2020.final,
#     file = "003_metro_tracts_final_2020.Rda")

## analysis ## 

paste("2020 MUTUAL INFO")
summary(metro.tracts.2020.final$msa_D)
paste("2020 RANK H")
summary(metro.tracts.2020.final$msa_H4_adj)
paste("2020 TRACTS CALC")
summary(metro.tracts.2020.final$tracts_calc)

paste("2020 LOW ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$ld_er, useNA = "ifany"))
paste("2020 MODERATE ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$md_er, useNA = "ifany"))
paste("2020 HIGH ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$hd_er, useNA = "ifany"))
paste("2020 ER DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$div_er, useNA = "ifany"))

paste("2020 INT ER ABS BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_er_abs, useNA = "ifany"))
paste("2020 INT ER REL BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_er_rel, useNA = "ifany"))
paste("2020 INT ER RELF BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_er_relf, useNA = "ifany"))

paste("2020 HIGH SES DIVERSITY")
prop.table(table(metro.tracts.2020.final$hd_ses, useNA = "ifany"))
paste("2020 MODERATE SES DIVERSITY")
prop.table(table(metro.tracts.2020.final$md_ses, useNA = "ifany"))
paste("2020 LOW SES DIVERSITY")
prop.table(table(metro.tracts.2020.final$ld_ses, useNA = "ifany"))

paste("2020 INT SES BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_ses, useNA = "ifany"))
paste("2020 INT SES REL A BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_ses_rel_a, useNA = "ifany"))
paste("2020 INT SES REL B BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_ses_rel_b, useNA = "ifany"))
paste("2020 INT SES REL C BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_ses_rel_c, useNA = "ifany"))
paste("2020 INT SES REL D BREAKDOWN")
prop.table(table(metro.tracts.2020.final$int_ses_rel_d, useNA = "ifany"))

paste("2020 HIGH JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$hd_jt, useNA = "ifany"))
paste("2020 MODERATE JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$md_jt, useNA = "ifany"))
paste("2020 LOW JT DIVERSITY BREAKDOWN")
prop.table(table(metro.tracts.2020.final$ld_jt, useNA = "ifany"))

paste("2020 INT JT ABS")
prop.table(table(metro.tracts.2020.final$int_jt_abs, useNA = "ifany"))
paste("2020 INT JT REL A")
prop.table(table(metro.tracts.2020.final$int_jt_rel_a, useNA = "ifany"))
paste("2020 INT JT RELF A")
prop.table(table(metro.tracts.2020.final$int_jt_relf_a, useNA = "ifany"))
paste("2020 INT JT REL B")
prop.table(table(metro.tracts.2020.final$int_jt_rel_b, useNA = "ifany"))
paste("2020 INT JT RELF B")
prop.table(table(metro.tracts.2020.final$int_jt_relf_b, useNA = "ifany"))
paste("2020 INT JT REL C")
prop.table(table(metro.tracts.2020.final$int_jt_rel_c, useNA = "ifany"))
paste("2020 INT JT RELF C")
prop.table(table(metro.tracts.2020.final$int_jt_relf_c, useNA = "ifany"))
paste("2020 INT JT REL D")
prop.table(table(metro.tracts.2020.final$int_jt_rel_d, useNA = "ifany"))
paste("2020 INT JT RELF D")
prop.table(table(metro.tracts.2020.final$int_jt_relf_d, useNA = "ifany"))

paste("2020 MAJ RACE")
table(metro.tracts.2020.final$maj_race, useNA = "ifany")

paste("2020 MAJ RACE AND INT ER REL CROSSTAB")
table(metro.tracts.2020.final$maj_race,
      metro.tracts.2020.final$int_er_rel,
      useNA = "ifany")

paste("2020 MAJ RACE AND INT SES CROSSTAB")
table(metro.tracts.2020.final$maj_race,
      metro.tracts.2020.final$int_ses,
      useNA = "ifany")

paste("2020 MAJ RACE AND INT JT RELF A CROSSTAB")
table(metro.tracts.2020.final$maj_race,
      metro.tracts.2020.final$int_jt_relf_a,
      useNA = "ifany")

paste("2020 MAJ SES AND INT RELF A CROSSTAB")
table(metro.tracts.2020.final$maj_ses,
      metro.tracts.2020.final$int_jt_relf_a,
      useNA = "ifany")

###################
## now long file ##
###################

af.long.temp <- rbind(metro.tracts.2000.final,
                      metro.tracts.2010.final,
                      metro.tracts.2020.final)

af.long <- af.long.temp %>%
  filter(GEOID %in% af.wide$GEOID)

paste("ARE THERE 3X OBS IN THE AF WIDE FILE?")
nrow(af.long) == 3 * nrow(af.wide)

##
## final processing for long file ##
##

af.long.final <- af.long %>%
  mutate(## create hard hit indicators ##
         sfips = substr(GEOID,1,2),
         hhf = ifelse(sfips %in% c("01","04","06","11","12",
                                   "13","17","18","21","26",
                                   "28","32","34","37","39",
                                   "41","44","47"),1,0))

paste("YEAR BREAKDOWN")
table(af.long.final$year)
paste("CROSSTAB OF YEAR AND HHF STATUS")
table(af.long.final$sfips, af.long.final$hhf)

##
## transpose to create wide analytic file ##
##

af.wide.final <- af.long.final %>%
  gather(vars, value, 4:ncol(af.long.final), convert=T) %>%
  unite(temp, vars, year) %>%
  spread(temp, value, convert=T)

##
## distribution of joint entropy ## 
##

seg.data <- data.frame(index = c(af.wide.final$trt_entropy_joints_2000[af.wide.final$int_jt_relf_a_2000 == 0],
                                 af.wide.final$trt_entropy_joints_2010[af.wide.final$int_jt_relf_a_2010 == 0],
                                 af.wide.final$trt_entropy_joints_2020[af.wide.final$int_jt_relf_a_2020 == 0]),
                        gr = c(rep("2000",length(af.wide.final$trt_entropy_joints_2000[af.wide.final$int_jt_relf_a_2000 == 0])),
                               rep("2010",length(af.wide.final$trt_entropy_joints_2010[af.wide.final$int_jt_relf_a_2010 == 0])),
                               rep("2020",length(af.wide.final$trt_entropy_joints_2020[af.wide.final$int_jt_relf_a_2020 == 0]))))

seg.data$gr_pad <- paste0("Seg ", seg.data$gr)

int.data <- data.frame(index = c(af.wide.final$trt_entropy_joints_2000[af.wide.final$int_jt_relf_a_2000 == 1],
                                 af.wide.final$trt_entropy_joints_2010[af.wide.final$int_jt_relf_a_2010 == 1],
                                 af.wide.final$trt_entropy_joints_2020[af.wide.final$int_jt_relf_a_2020 == 1]),
                       gr = c(rep("2000",length(af.wide.final$trt_entropy_joints_2000[af.wide.final$int_jt_relf_a_2000 == 1])),
                              rep("2010",length(af.wide.final$trt_entropy_joints_2010[af.wide.final$int_jt_relf_a_2010 == 1])),
                              rep("2020",length(af.wide.final$trt_entropy_joints_2020[af.wide.final$int_jt_relf_a_2020 == 1]))))

int.data$gr_pad <- paste0("Int ", int.data$gr)

edata <- rbind(seg.data,
               int.data)

edata$gr_fin <- factor(edata$gr_pad,
                       levels = c("Seg 2000",
                                  "Seg 2010",
                                   "Seg 2020",
                                   "Int 2000",
                                   "Int 2010",
                                   "Int 2020"))

##
## graph the distributions
## this is appendix figure a1
##

ggplot(edata, 
  aes(x = index, y = gr_fin, fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, size = 0.3, rel_min_height = 0.01) +
  labs(title = '')  + 
  theme_classic() +
  scale_fill_gradient(name = "Scaled joint entropy", low="red",high="green") + 
  xlab("") + 
  ylab("") + 
  scale_y_discrete(limits = rev(levels(edata$gr_fin)))

##
## check that transpose worked ## 
##

paste("DID THE TRANSPOSE WORK?")
nrow(af.wide.final)*3 == nrow(af.long.final)

##
## test to make sure indices were created correctly ## 
##

test.seg <- af.wide.final %>%
  select(CBSA,
         msa_D_2000,
         msa_D_2010,
         msa_D_2020,
         msa_H4_adj_2000,
         msa_H4_adj_2010,
         msa_H4_adj_2020,
         tracts_calc_2000,
         tracts_calc_2010,
         tracts_calc_2020) %>%
  left_join(cbsa.names, "CBSA") %>%
  unique() %>%
  mutate(good = ifelse(tracts_calc_2000 == tracts_calc_2010 & 
                       tracts_calc_2010 == tracts_calc_2020 & 
                       tracts_calc_2000 == tracts_calc_2020, 1, 0))

paste("FILE CONSTRUCTION TEST")
summary(test.seg)
paste("SAME NUMBER OF TRACTS ACROSS YEARS?")
table(test.seg$good, useNA = "ifany")

## save analytic files ## 

save(af.wide.final,
     file= paste0(data_path, 
                 "/002_af_wide_final.Rda"))

save(af.long.final,
     file= paste0(data_path, 
                 "/002_af_long_final.Rda"))

### END OF PROGRAM ###

sink()
sink(type = "message")
      
